In [1]:
import pandas as pd
import os
os.chdir(os.path.dirname(os.getcwd()))

%load_ext autoreload
%autoreload 2
%reload_ext autoreload


from resources.constants import *

pictures_df = pd.read_csv(PICTURE_TRIPLETS_CSV_PATH, sep=CSV_SEPARATOR)
outfits_df = pd.read_csv(OUTFITS_CSV_PATH, sep=CSV_SEPARATOR)
user_triplets_df = pd.read_csv(USER_ACTIVITY_TRIPLETS_CSV_PATH, sep=CSV_SEPARATOR)

# Ensure tags are lists
outfits_df["tag_categories"] = outfits_df["tag_categories"].apply(eval)
outfits_df["outfit_tags"] = outfits_df["outfit_tags"].apply(eval)

In [2]:
# Append orders from before 2020 to the evaluated dataset
original_orders_df = pd.read_csv(ORIGINAL_ORDERS_CSV_PATH, sep=CSV_SEPARATOR)
user_triplets_df = pd.concat([user_triplets_df, original_orders_df], ignore_index=True)

In [4]:
import src.load_baseline_resources
import pickle
from resources.constants import EMBEDDING_MODEL_DICT_PICKLE_PATH

loaded_embeddings_dict = src.load_baseline_resources.load_embeddings_from_folder()
pickle.dump(loaded_embeddings_dict, open(EMBEDDING_MODEL_DICT_PICKLE_PATH, "wb"))

# Loading embeddings is expensive due to file operations, so we save them to a pickle file. To recreate this pickle file, run the commented code above
loaded_embeddings_dict = pickle.load(open(EMBEDDING_MODEL_DICT_PICKLE_PATH, "rb"))

# Introduce the embeddings for each outfit, if the outfit has no embeddings, we drop it
pictures_df["embeddings"] = pictures_df["picture.id"].map(loaded_embeddings_dict)
outfit_pictures_df = pictures_df.groupby("outfit.id").agg({"picture.id": list, "embeddings": list}).reset_index()
outfits_df["embeddings"] = outfits_df["id"].map(outfit_pictures_df.set_index("outfit.id")["embeddings"])
outfits_df = outfits_df.dropna(subset=["embeddings"])

  0%|          | 0/50293 [00:00<?, ?it/s]

In [5]:
from src.prepare_train_test_splits import translate_user_triplets_to_orders, remove_consecutive_duplicates

# Convert triplets into entries for each individual user
user_triplets_df = remove_consecutive_duplicates(user_triplets_df)
user_triplets_df = user_triplets_df[user_triplets_df["outfit.id"].isin(outfits_df["id"])] # Remove triplets with no embeddings
user_orders_df = translate_user_triplets_to_orders(user_triplets_df, outfits_df)


4949


In [6]:
from src.prepare_train_test_splits import convert_user_orders_to_train_test_splits

user_splits_df, user_splits_unique_df = convert_user_orders_to_train_test_splits(user_orders_df, percentage_test=0.3)

No unique outfit found with groups ['group.4bd4ee24eac8948e82783b15d9404f6b'
 'group.4bd4ee24eac8948e82783b15d9404f6b']
No unique outfit found with groups ['group.423a23f6717e6d85adac54c051ee9832'
 'group.423a23f6717e6d85adac54c051ee9832']
No unique outfit found with groups ['group.e0cb0f6e113edc4df8a1e304376734f6'
 'group.e0cb0f6e113edc4df8a1e304376734f6']
No unique outfit found with groups ['group.384b8170c6a6ddfd568ff7fab5fb49c4'
 'group.384b8170c6a6ddfd568ff7fab5fb49c4']
No unique outfit found with groups ['group.edb60c2f440a9ac7d0883fb9371c8607'
 'group.edb60c2f440a9ac7d0883fb9371c8607']
No unique outfit found with groups ['group.a3ab26b5d2f7ef2cf102422a3dde3b46'
 'group.a3ab26b5d2f7ef2cf102422a3dde3b46']
No unique outfit found with groups ['group.2c7095c075561fe6278f3a2d7c1d6ac9'
 'group.2c7095c075561fe6278f3a2d7c1d6ac9']
No unique outfit found with groups ['group.ae8da3f0ad6f8ff3f83b2af96e975991'
 'group.ae8da3f0ad6f8ff3f83b2af96e975991']
No unique outfit found with groups ['gro

In [7]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

all_tags = outfits_df["outfit_tags"].values.tolist()
mlb = MultiLabelBinarizer()
one_hot_encoded = mlb.fit_transform(all_tags)
outfits_df["one_hot_encoded"] = [np.array(oh_list) for oh_list in one_hot_encoded.tolist()]

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim

# Load your dataframe (example)
# outfits_df = pd.read_csv("path_to_your_dataframe.csv")

# Assuming your dataframe has the following columns:
# "one_hot_encoded" and "mean_embeddings"
# Convert them to numpy arrays
def get_mean_embedding(embeddings):
    embeddings = np.array(embeddings)
    mean_embedding = np.mean(embeddings, axis=0)
    return mean_embedding

def concatenate_embeddings(oh_embeddings, image_embeddings, oh_weighting):
    oh_embeddings = np.array(oh_embeddings) * oh_weighting
    return np.concatenate((oh_embeddings, image_embeddings))

outfits_df["mean_embeddings"] = outfits_df["embeddings"].apply(lambda x: get_mean_embedding(x))
#one_hot_encoded = np.array(outfits_df["one_hot_encoded"].tolist())
#mean_embeddings = np.array(outfits_df["mean_embeddings"].tolist())

outfits_df["concatenated_embeddings"] = outfits_df.apply(lambda x: concatenate_embeddings(x["one_hot_encoded"], x["mean_embeddings"], oh_weighting=4), axis=1)


# Converting lists to tensors is inefficient, so we convert them to numpy arrays first. Saves a couple of seconds.
input_embeddings = np.vstack(outfits_df["concatenated_embeddings"].values)
input_embeddings = input_embeddings.astype(np.float32)
input_embeddings = torch.tensor(input_embeddings) 

In [9]:
from tqdm.notebook import tqdm

class Autoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

input_dim = input_embeddings.shape[1]
hidden_dim = 2048 
latent_dim = 512 

model = Autoencoder(input_dim, hidden_dim, latent_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 3
batch_size = 32

for epoch in tqdm(range(num_epochs)):
    permutation = torch.randperm(input_embeddings.size()[0])
    
    for i in range(0, input_embeddings.size()[0], batch_size):
        indices = permutation[i:i+batch_size]
        batch_inputs = input_embeddings[indices]

        encoded, decoded = model(batch_inputs)
        loss = criterion(decoded, batch_inputs)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Save the model
#torch.save(model.state_dict(), 'autoencoder_model.pth')


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch [1/3], Loss: 0.1819
Epoch [2/3], Loss: 0.1676
Epoch [3/3], Loss: 0.1680


In [10]:
def get_outfit_embeddings(outfits_df, model):
    one_hot_encoded = np.array(outfits_df["one_hot_encoded"].tolist())
    mean_embeddings = np.array(outfits_df["mean_embeddings"].tolist())
    input_embeddings = np.concatenate((one_hot_encoded, mean_embeddings), axis=1)
    input_embeddings = torch.tensor(input_embeddings, dtype=torch.float32)
    with torch.no_grad():
        encoded, decoded = model(input_embeddings)
    return encoded

outfit_embeddings = get_outfit_embeddings(outfits_df, model)
outfits_df["outfit_embeddings"] = [x.numpy() for x in outfit_embeddings]
print(np.stack(outfits_df["outfit_embeddings"].values).shape)

(15157, 512)


In [11]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
from tqdm.notebook import tqdm

NUM_ITEMS = 100

def find_rental_history_embeddings(outfit_ids, outfit_to_embedding_dict):
    outfit_ids = [outfit_id for outfit_id in outfit_ids if outfit_id != "nan"] # TODO: Find out where these nan values are coming from, only two of them for now, as far as I can tell.
    return [outfit_to_embedding_dict[outfit_id] for outfit_id in outfit_ids]

def get_mean_embedding(embeddings):
    embeddings = np.array(embeddings)
    mean_embedding = np.mean(embeddings, axis=0)
    return mean_embedding

def get_nearest_neighbors_batch(embeddings, nn, num_items, index_to_id):
    distances, indices = nn.kneighbors(embeddings, n_neighbors=num_items+1)
    ids = [[index_to_id[i] for i in idx[1:]] for idx in indices]
    distances = [dist[1:] for dist in distances]
    return ids, distances


def predict_nearest_neighbors(df, outfits_df, embeddings_column="embeddings"):
    outfit_to_embedding_dict = outfits_df.set_index("id")[embeddings_column].to_dict()
    index_to_outfit_dict = {i: outfit_id for i, outfit_id in enumerate(outfits_df["id"].values)}
    group_to_embedding_dict = outfits_df.set_index("group")[embeddings_column].to_dict()
    index_to_group_dict = {i: group for i, group in enumerate(outfits_df["group"].values)}
    
    df["train_id_embeddings"] = df["train_outfit_ids"].apply(lambda x: find_rental_history_embeddings(x, outfit_to_embedding_dict))
    df["train_group_embeddings"] = df["train_group"].apply(lambda x: find_rental_history_embeddings(x, group_to_embedding_dict))

    df["rental_history_id_embedding"] = df["train_id_embeddings"].apply(lambda x: get_mean_embedding(x))
    df["rental_history_group_embedding"] = df["train_group_embeddings"].apply(lambda x: get_mean_embedding(x))

    nearest_neighbors = NearestNeighbors(n_neighbors=NUM_ITEMS+1, metric="cosine")
    embeddings = np.stack(outfits_df[embeddings_column].values)
    nearest_neighbors.fit(embeddings)

    id_embeddings = np.stack(df["rental_history_id_embedding"].values)
    group_embeddings = np.stack(df["rental_history_group_embedding"].values)

    id_predictions, id_distances = get_nearest_neighbors_batch(id_embeddings, nearest_neighbors, NUM_ITEMS, index_to_outfit_dict)
    group_predictions, group_distances = get_nearest_neighbors_batch(group_embeddings, nearest_neighbors, NUM_ITEMS, index_to_group_dict)

    df["id_prediction"], df["id_prediction_distances"] = id_predictions, id_distances
    df["group_prediction"], df["group_prediction_distances"] = group_predictions, group_distances
    
    return df

def predict_nearest_neighbors_images(df, outfits_df, embeddings_column="embeddings"):
    outfits_df["mean_embeddings"] = outfits_df[embeddings_column].apply(lambda x: get_mean_embedding(x))

    return predict_nearest_neighbors(df, outfits_df, embeddings_column="mean_embeddings")

# Apply to dataframes
tqdm.pandas()

METHOD = "Img Embed"

if METHOD == "Tag Embed":
    # Tag based predictions
    user_splits_df = predict_nearest_neighbors(user_splits_df, outfits_df, embeddings_column="one_hot_encoded")
    user_splits_unique_df = predict_nearest_neighbors(user_splits_unique_df, outfits_df, embeddings_column="one_hot_encoded")
elif METHOD == "Img Embed":
    # Image based predictions
    user_splits_df = predict_nearest_neighbors_images(user_splits_df, outfits_df, embeddings_column="embeddings")
    user_splits_unique_df = predict_nearest_neighbors_images(user_splits_unique_df, outfits_df, embeddings_column="embeddings")
elif METHOD == "Combined Embed":
    # Combined predictions
    user_splits_df = predict_nearest_neighbors(user_splits_df, outfits_df, embeddings_column="outfit_embeddings")
    user_splits_unique_df = predict_nearest_neighbors(user_splits_unique_df, outfits_df, embeddings_column="outfit_embeddings")
elif METHOD == "Concat Img Tag Embed":
    # Concat predictions
    user_splits_df = predict_nearest_neighbors(user_splits_df, outfits_df, embeddings_column="concatenated_embeddings")
    user_splits_unique_df = predict_nearest_neighbors(user_splits_unique_df, outfits_df, embeddings_column="concatenated_embeddings")

In [12]:
from IPython.display import display

def evaluate_hit_rate_at_n(test_id, predicted_ids, n=10):
    if predicted_ids is np.nan:
        print(f"None prediction for {test_id}!")
        return 0
    predicted_ids = predicted_ids[:n]
    if type(test_id) == str or type(test_id) == np.str_:
        if test_id in predicted_ids:
            #print(f"Hit at {n} for {test_id} in {predicted_ids}")
            return 1
    elif type(test_id) == list or type(test_id) == np.ndarray:
        for outfit_id in test_id:
            if outfit_id in predicted_ids:
                return 1
    else:
        raise ValueError(f"Unknown type {type(test_id)}")
    return 0

HIT_RATE_COLUMNS = ["id_hit_rate_at_100", "id_hit_rate_at_10", "group_hit_rate_at_100", "group_hit_rate_at_10"]
def evaluate_df_hit_rate_at_n(df, n=10):
    df["id_hit_rate_at_100"] = df.apply(lambda x: evaluate_hit_rate_at_n(x["test_outfit_id"], x["id_prediction"], n=100), axis=1)
    df["id_hit_rate_at_10"] = df.apply(lambda x: evaluate_hit_rate_at_n(x["test_outfit_id"], x["id_prediction"], n=10), axis=1)
    df["group_hit_rate_at_100"] = df.apply(lambda x: evaluate_hit_rate_at_n(x["test_group"], x["group_prediction"], n=100), axis=1)
    df["group_hit_rate_at_10"] = df.apply(lambda x: evaluate_hit_rate_at_n(x["test_group"], x["group_prediction"], n=10), axis=1)
    display(df[HIT_RATE_COLUMNS].mean())
    return df


user_splits_df = evaluate_df_hit_rate_at_n(user_splits_df, n=10)
user_splits_unique_df = evaluate_df_hit_rate_at_n(user_splits_unique_df, n=10)

id_hit_rate_at_100       0.133436
id_hit_rate_at_10        0.035549
group_hit_rate_at_100    0.144256
group_hit_rate_at_10     0.038640
dtype: float64

id_hit_rate_at_100       0.153051
id_hit_rate_at_10        0.034126
group_hit_rate_at_100    0.160807
group_hit_rate_at_10     0.036453
dtype: float64

In [13]:
import pyperclip

def format_dicts_into_latex(all_dict, ind_dict, precision=4, run_name="Random"):
    first_row = f"{run_name} Ind & {all_dict['id_hit_rate_at_10']:.{precision}f} & {all_dict['id_hit_rate_at_100']:.{precision}f} & {ind_dict['id_hit_rate_at_10']:.{precision}f} & {ind_dict['id_hit_rate_at_100']:.{precision}f} \\\\"
    second_row = f"{run_name} Groups & {all_dict['group_hit_rate_at_10']:.{precision}f} & {all_dict['group_hit_rate_at_100']:.{precision}f} & {ind_dict['group_hit_rate_at_10']:.{precision}f} & {ind_dict['group_hit_rate_at_100']:.{precision}f} \\\\\\hline"
    full_string = first_row + "\n" + second_row + "\n"
    print(full_string)
    pyperclip.copy(full_string)

all_dict = {column: user_splits_df[column].mean() for column in HIT_RATE_COLUMNS}
ind_dict = {column: user_splits_unique_df[column].mean() for column in HIT_RATE_COLUMNS}

format_dicts_into_latex(all_dict, ind_dict, precision=4, run_name=METHOD)

Img Embed Ind & 0.0355 & 0.1334 & 0.0341 & 0.1531 \\
Img Embed Groups & 0.0386 & 0.1443 & 0.0365 & 0.1608 \\\hline



In [14]:
import numpy as np

def get_outfit_category(tag_categories, tags, category):
    tag_categories, tags = np.array(tag_categories), np.array(tags)
    category_indexes = np.where(tag_categories == category)[0]
    if len(category_indexes) == 0:
        return ""
    cat_tags = tags[category_indexes]
    output = str(cat_tags[0])
    return output

outfits_df["size"] = outfits_df.apply(lambda x: get_outfit_category(x["tag_categories"], x["outfit_tags"], "Size"), axis=1)
