In [1]:
import pandas as pd
import numpy as np
import os
# Move the working directory one level up so that the project packages imported
# below (`resources`, `src`) can be resolved — presumably this notebook lives in a
# subfolder of the repository root; verify.
# NOTE(review): not idempotent — re-running this cell without restarting the kernel
# moves the working directory up one more level each time.
os.chdir(os.path.dirname(os.getcwd()))

# Load the autoreload extension and reload all modules before each cell execution (mode 2).
%load_ext autoreload
%autoreload 2
# NOTE(review): the extension is already loaded by `%load_ext` above; this line looks redundant.
%reload_ext autoreload


from resources.constants import *

# Load the three input tables. The path constants and CSV_SEPARATOR are not defined in
# this notebook — presumably they come from the star-import of `resources.constants`; verify.
pictures_df = pd.read_csv(PICTURE_TRIPLETS_CSV_PATH, sep=CSV_SEPARATOR)
outfits_df = pd.read_csv(OUTFITS_CSV_PATH, sep=CSV_SEPARATOR)
user_triplets_df = pd.read_csv(USER_ACTIVITY_TRIPLETS_CSV_PATH, sep=CSV_SEPARATOR)

# CSV files are read as strings, so we need to convert them to lists
# NOTE(review): `eval` executes whatever Python expression is stored in each cell. If these
# columns only ever hold plain list literals, `ast.literal_eval` would be a safer
# replacement — confirm the stored format before switching.
outfits_df["tag_categories"] = outfits_df["tag_categories"].apply(eval)
outfits_df["outfit_tags"] = outfits_df["outfit_tags"].apply(eval)

In [2]:
# Add original orders to user triplets
# The rows read from ORIGINAL_ORDERS_CSV_PATH are appended below the existing triplets and
# the index is renumbered (ignore_index=True).
# NOTE(review): `user_triplets_df` is overwritten with the concatenation, so re-running this
# cell without re-running the loading cell appends the original orders a second time.
original_orders_df = pd.read_csv(ORIGINAL_ORDERS_CSV_PATH, sep=CSV_SEPARATOR)
user_triplets_df = pd.concat([user_triplets_df, original_orders_df], ignore_index=True)

In [3]:
from src.prepare_train_test_splits import translate_user_triplets_to_orders, remove_consecutive_duplicates

# Convert triplets into entries for each individual user
# `user_triplets_df` is replaced by the output of `remove_consecutive_duplicates` before it
# is combined with `outfits_df` into `user_orders_df`.
# NOTE(review): one of these two calls prints a number (4949 in the saved output); what it
# counts is not visible from this notebook — check src.prepare_train_test_splits.
user_triplets_df = remove_consecutive_duplicates(user_triplets_df)
user_orders_df = translate_user_triplets_to_orders(user_triplets_df, outfits_df)

4949


In [4]:
import numpy as np
from src.prepare_train_test_splits import convert_user_orders_to_train_test_splits

# Split the data into train and test sets. Two dataframes are returned: one with no
# restrictions on the outfits in the test data, and one that prohibits repeated outfits.
# The call prints every case in which it is unable to construct a test set with unique outfits.
user_splits_df, user_splits_unique_df = convert_user_orders_to_train_test_splits(user_orders_df, percentage_test=0.3)

No unique outfit found with groups ['group.4bd4ee24eac8948e82783b15d9404f6b'
 'group.4bd4ee24eac8948e82783b15d9404f6b']
No unique outfit found with groups ['group.423a23f6717e6d85adac54c051ee9832'
 'group.423a23f6717e6d85adac54c051ee9832']
No unique outfit found with groups ['group.e0cb0f6e113edc4df8a1e304376734f6'
 'group.e0cb0f6e113edc4df8a1e304376734f6']
No unique outfit found with groups ['group.384b8170c6a6ddfd568ff7fab5fb49c4'
 'group.384b8170c6a6ddfd568ff7fab5fb49c4']
No unique outfit found with groups ['group.edb60c2f440a9ac7d0883fb9371c8607'
 'group.edb60c2f440a9ac7d0883fb9371c8607']
No unique outfit found with groups ['group.a3ab26b5d2f7ef2cf102422a3dde3b46'
 'group.a3ab26b5d2f7ef2cf102422a3dde3b46']
No unique outfit found with groups ['group.2c7095c075561fe6278f3a2d7c1d6ac9'
 'group.2c7095c075561fe6278f3a2d7c1d6ac9']
No unique outfit found with groups ['group.ae8da3f0ad6f8ff3f83b2af96e975991'
 'group.ae8da3f0ad6f8ff3f83b2af96e975991']
No unique outfit found with groups ['gro

In [5]:
def check_if_train_is_in_test(train_outfit_ids, test_outfit_ids):
    """Report whether any training outfit id also appears among the test outfit ids.

    When an overlap is found, both id collections are printed on one line as
    ``<train ids>||<test ids>`` (ids separated by ", ") so the offending row can be located.

    Args:
        train_outfit_ids: Sequence of outfit ids in the training part of a split.
        test_outfit_ids: A single outfit id or a sequence of outfit ids.

    Returns:
        The (numpy) boolean result of the overlap test: truthy if at least one
        training id occurs in the test ids.
    """
    contaminated = np.isin(train_outfit_ids, test_outfit_ids).any()
    if contaminated:
        # np.atleast_1d keeps a single string id in one piece (joining a bare string
        # would split it into characters); str() lets non-string ids be joined too.
        train_text = ", ".join(map(str, np.atleast_1d(train_outfit_ids)))
        test_text = ", ".join(map(str, np.atleast_1d(test_outfit_ids)))
        print(train_text + "||" + test_text)
    return contaminated

# Sanity check on the unique-outfit split: count the rows whose training ids overlap
# with the test id(s). The expected count is 0.
user_splits_unique_df.apply(
    lambda row: check_if_train_is_in_test(row["train_outfit_ids"], row["test_outfit_id"]),
    axis=1,
).sum()

0

In [6]:
import src.rs_methods
# The maximum number of items to recommend per user. The baseline functions defined below
# read this module-level constant when they build their prediction lists.
NUM_ITEMS = 100

# The functions below implement the four baseline methods discussed in the article.

# Most popular outfits prediction
def predict_most_popular(user_splits_df, user_splits_unique_df):
    """Give every user the same recommendation list: the most popular outfits.

    Each dataframe is handled independently. Its own popularity ranking is obtained
    from ``src.rs_methods.get_most_popular_outfits`` (limited to NUM_ITEMS entries)
    and written to every row of the "id_prediction" and "group_prediction" columns.
    Both dataframes are modified in place and returned.

    Args:
        user_splits_df: Split dataframe without restrictions on the test outfits.
        user_splits_unique_df: Split dataframe with unique test outfits.

    Returns:
        Tuple ``(user_splits_df, user_splits_unique_df)`` with the prediction columns set.
    """
    for splits in (user_splits_df, user_splits_unique_df):
        top_outfit_ids, top_groups = src.rs_methods.get_most_popular_outfits(splits, NUM_ITEMS)
        row_count = len(splits)
        # Every row references the same ranking object; the ranking is not copied per row.
        splits["id_prediction"] = [top_outfit_ids for _ in range(row_count)]
        splits["group_prediction"] = [top_groups for _ in range(row_count)]
    return user_splits_df, user_splits_unique_df

# Previous rental prediction
def predict_previous_rental(user_splits_df, user_splits_unique_df, num_items=None):
    """Recommend to each user the trailing entries of their own training history.

    For both dataframes the last ``num_items`` entries of "train_outfit_ids" and
    "train_group" are written to "id_prediction" and "group_prediction". Both
    dataframes are modified in place and returned.

    Args:
        user_splits_df: Split dataframe without restrictions on the test outfits.
        user_splits_unique_df: Split dataframe with unique test outfits.
        num_items: Maximum length of each prediction list. Defaults to the
            module-level NUM_ITEMS when omitted.

    Returns:
        Tuple ``(user_splits_df, user_splits_unique_df)`` with the prediction columns set.
    """
    if num_items is None:
        num_items = NUM_ITEMS

    def last_items(history):
        # Slice from an explicit start index: `history[-0:]` would return the whole
        # history instead of an empty list. Slicing is applied to every row of both
        # frames, so short histories are handled the same way everywhere and a list
        # prediction never shares its object with the training column.
        return history[max(len(history) - num_items, 0):]

    for splits in (user_splits_df, user_splits_unique_df):
        splits["id_prediction"] = splits["train_outfit_ids"].apply(last_items)
        splits["group_prediction"] = splits["train_group"].apply(last_items)
    return user_splits_df, user_splits_unique_df

# Previous rental + most popular outfits prediction
def predict_rental_and_most_popular(user_splits_df, user_splits_unique_df, num_items=None):
    """Recommend each user's own training history, padded with popular outfits.

    A history shorter than ``num_items`` is extended (via ``np.append``) with the
    leading entries of the popularity ranking until it reaches ``num_items``; a
    longer history is cut down to its last ``num_items`` entries. Results go to
    the "id_prediction" and "group_prediction" columns. Both dataframes are
    modified in place and returned.

    The popularity ranking is computed separately for each dataframe, in the same
    way ``predict_most_popular`` does, so the unique split is padded from its own
    ranking rather than from the ranking of the unrestricted split.

    Args:
        user_splits_df: Split dataframe without restrictions on the test outfits.
        user_splits_unique_df: Split dataframe with unique test outfits.
        num_items: Length of each prediction list. Defaults to the module-level
            NUM_ITEMS when omitted.

    Returns:
        Tuple ``(user_splits_df, user_splits_unique_df)`` with the prediction columns set.
    """
    if num_items is None:
        num_items = NUM_ITEMS

    def pad_with_most_popular(x, pop_outfits):
        if len(x) < num_items:
            # NOTE(review): popular entries already present in `x` are not filtered out,
            # so a padded list can contain the same entry twice.
            return np.append(x, pop_outfits[:num_items - len(x)])
        # Explicit start index: `x[-0:]` would return all of `x` when num_items is 0.
        return x[len(x) - num_items:]

    for splits in (user_splits_df, user_splits_unique_df):
        popular_outfit_ids, popular_groups = src.rs_methods.get_most_popular_outfits(splits, num_items)
        splits["id_prediction"] = splits.apply(
            lambda row: pad_with_most_popular(row["train_outfit_ids"], popular_outfit_ids), axis=1
        )
        splits["group_prediction"] = splits.apply(
            lambda row: pad_with_most_popular(row["train_group"], popular_groups), axis=1
        )
    return user_splits_df, user_splits_unique_df

# Random prediction
def predict_random_outfit(user_splits_df, user_splits_unique_df, num_items=None, rng=None):
    """Recommend randomly drawn outfits and groups to every user.

    For each row, entries are drawn without replacement from ``outfits_df["id"]``
    for "id_prediction" and, independently, from ``outfits_df["group"]`` for
    "group_prediction". Both dataframes are modified in place and returned.

    Args:
        user_splits_df: Split dataframe without restrictions on the test outfits.
        user_splits_unique_df: Split dataframe with unique test outfits.
        num_items: Number of entries to draw per row. Defaults to the module-level
            NUM_ITEMS when omitted. It is capped at the number of available entries,
            because drawing more than that without replacement raises ValueError.
        rng: Optional random source exposing ``choice(a, size, replace=...)``, e.g.
            ``np.random.default_rng(seed)``, for reproducible draws. When omitted the
            global ``np.random`` state is used.

    Returns:
        Tuple ``(user_splits_df, user_splits_unique_df)`` with the prediction columns set.
    """
    if num_items is None:
        num_items = NUM_ITEMS
    sampler = np.random if rng is None else rng

    all_outfit_ids = outfits_df["id"].values
    all_groups = outfits_df["group"].values
    id_sample_size = min(num_items, len(all_outfit_ids))
    group_sample_size = min(num_items, len(all_groups))

    # NOTE(review): sampling is over rows of the group column, so if several outfits share
    # a group the drawn list can repeat a group value even with replace=False.
    for splits in (user_splits_df, user_splits_unique_df):
        splits["id_prediction"] = splits.apply(
            lambda _row: sampler.choice(all_outfit_ids, id_sample_size, replace=False), axis=1
        )
        splits["group_prediction"] = splits.apply(
            lambda _row: sampler.choice(all_groups, group_sample_size, replace=False), axis=1
        )

    return user_splits_df, user_splits_unique_df


# Choose the method to use here
METHOD = "Rep"

# Maps each baseline label to the function that fills in the prediction columns.
BASELINE_PREDICTORS = {
    "Pop": predict_most_popular,
    "Rep": predict_previous_rental,
    "Rep + Pop": predict_rental_and_most_popular,
    "Rand": predict_random_outfit,
}

# Fail loudly on a mistyped label: silently skipping the prediction step would leave the
# prediction columns missing, or stale from an earlier run of this cell.
if METHOD not in BASELINE_PREDICTORS:
    raise ValueError(f"Unknown METHOD {METHOD!r}; expected one of {sorted(BASELINE_PREDICTORS)}")

user_splits_df, user_splits_unique_df = BASELINE_PREDICTORS[METHOD](user_splits_df, user_splits_unique_df)


In [7]:
from IPython.display import display

# Evaluate the hit rate at n for a single user
def evaluate_hit_rate_at_n(test_id, predicted_ids, n=10):
    """Return 1 if a test id occurs among the first ``n`` predictions, else 0.

    Args:
        test_id: A single id (``str`` or ``np.str_``) or a list, tuple or numpy
            array of ids; with several ids, one match is enough for a hit.
        predicted_ids: Ordered sequence of predicted ids. A missing prediction
            (``None`` or a NaN float) is reported on stdout and scored as 0.
        n: Number of leading predictions to consider.

    Returns:
        1 for a hit, 0 otherwise.

    Raises:
        ValueError: If ``test_id`` is neither a string nor a list/tuple/array.
    """
    # A missing prediction can arrive as None, np.nan or any other NaN float; an identity
    # check against np.nan only recognises that one object and lets the others crash on
    # the slice below.
    if predicted_ids is None or (isinstance(predicted_ids, float) and np.isnan(predicted_ids)):
        print(f"None prediction for {test_id}!")
        return 0
    top_predictions = list(predicted_ids[:n])
    # np.str_ is a subclass of str, so one isinstance check covers both.
    if isinstance(test_id, str):
        return int(test_id in top_predictions)
    if isinstance(test_id, (list, tuple, np.ndarray)):
        return int(any(outfit_id in top_predictions for outfit_id in test_id))
    raise ValueError(f"Unknown type {type(test_id)}")

# Evaluate the hit rate at n for all dataframes
def evaluate_df_hit_rate_at_n(df, n=10):
    """Score every row of ``df`` at cutoffs 100 and 10 and report the mean hit rates.

    Four 0/1 columns are added to ``df`` in place: "id_hit_rate_at_100",
    "id_hit_rate_at_10", "group_hit_rate_at_100" and "group_hit_rate_at_10".
    The id columns compare "test_outfit_id" with "id_prediction"; the group
    columns compare "test_group" with "group_prediction". The column means are
    displayed and also returned as a dictionary.

    Args:
        df: Split dataframe that already holds the prediction columns.
        n: Not used by this function; the cutoffs are fixed at 100 and 10.

    Returns:
        Tuple ``(df, result_dict)`` where ``result_dict`` maps each of the four
        column names to its mean.
    """
    targets = (
        ("id", "test_outfit_id", "id_prediction"),
        ("group", "test_group", "group_prediction"),
    )
    cutoffs = (100, 10)

    hit_rate_columns = []
    for prefix, truth_column, prediction_column in targets:
        for cutoff in cutoffs:
            column = f"{prefix}_hit_rate_at_{cutoff}"

            def score_row(row, truth_column=truth_column, prediction_column=prediction_column, cutoff=cutoff):
                return evaluate_hit_rate_at_n(row[truth_column], row[prediction_column], n=cutoff)

            df[column] = df.apply(score_row, axis=1)
            hit_rate_columns.append(column)

    display(df[hit_rate_columns].mean())
    result_dict = {}
    for column in hit_rate_columns:
        result_dict[column] = df[column].mean()
    return df, result_dict

# Score both splits: `all_dict` receives the mean hit rates of the unrestricted split and
# `ind_dict` those of the unique-outfit split.
# NOTE(review): the `n=10` argument has no effect — `evaluate_df_hit_rate_at_n` always
# reports the cutoffs 100 and 10.
print(f"Baseline evaluation for method: {METHOD}")
user_splits_df, all_dict = evaluate_df_hit_rate_at_n(user_splits_df, n=10)
user_splits_unique_df, ind_dict = evaluate_df_hit_rate_at_n(user_splits_unique_df, n=10)

Baseline evaluation for method: Rep


id_hit_rate_at_100       0.125512
id_hit_rate_at_10        0.054559
group_hit_rate_at_100    0.153176
group_hit_rate_at_10     0.075564
dtype: float64

id_hit_rate_at_100       0.0
id_hit_rate_at_10        0.0
group_hit_rate_at_100    0.0
group_hit_rate_at_10     0.0
dtype: float64

In [8]:
import pyperclip

# A small function to format the results into the format of the latex table in the article.
def format_dicts_into_latex(all_dict, ind_dict, precision=4, run_name="Random"):
    """Print two LaTeX table rows with the hit rates and copy them to the clipboard.

    The first row ("<run_name> Ind") holds the id hit rates and the second row
    ("<run_name> Groups") the group hit rates. Each row lists, in this order:
    ``all_dict`` at 10, ``all_dict`` at 100, ``ind_dict`` at 10, ``ind_dict`` at 100.

    Args:
        all_dict: Mapping with the keys "id_hit_rate_at_10", "id_hit_rate_at_100",
            "group_hit_rate_at_10" and "group_hit_rate_at_100".
        ind_dict: Mapping with the same four keys.
        precision: Number of decimals printed for every value.
        run_name: Label placed at the start of both rows.

    Returns:
        None. The rows are printed; copying to the clipboard is best-effort.
    """
    def build_row(label, prefix, row_end):
        cells = []
        for results in (all_dict, ind_dict):
            for cutoff in (10, 100):
                value = results[f"{prefix}_hit_rate_at_{cutoff}"]
                cells.append(f"{value:.{precision}f}")
        return f"{run_name} {label} & " + " & ".join(cells) + " " + row_end

    first_row = build_row("Ind", "id", "\\\\")
    second_row = build_row("Groups", "group", "\\\\\\hline")
    full_string = first_row + "\n" + second_row + "\n"
    print(full_string)
    # The clipboard is a convenience only: on machines without a clipboard backend
    # pyperclip raises, which should not abort the notebook after the rows were printed.
    try:
        pyperclip.copy(full_string)
    except pyperclip.PyperclipException as error:
        print(f"Could not copy the rows to the clipboard: {error}")

# Print the LaTeX rows for the selected baseline (also copied to the clipboard by the function).
format_dicts_into_latex(all_dict, ind_dict, precision=4, run_name=METHOD)

Rep Ind & 0.0546 & 0.1255 & 0.0000 & 0.0000 \\
Rep Groups & 0.0756 & 0.1532 & 0.0000 & 0.0000 \\\hline

