This notebook creates a very simple model for generating outfits. This is sligth better than a random model. It computes the most popular category combinations in the outfits dataset and sort the candidate items, by this category combination pupularity.

This script takes less than 5 minutes to run

# Train - Computation of the category combinations

In [4]:
import datetime
print(datetime.datetime.now())

2022-06-17 12:06:36.619108


In [5]:
import pandas as pd

full_outfits = pd.read_parquet("../data/manual_outfits_train_baseline_polyvore.parquet")
full_outfits.head()

Unnamed: 0,products,outfit_id
0,"[194508109, 188778349, 188977857, 194942557, 1...",214181831
1,"[108042640, 103135174, 107307153, 107349261, 1...",120161271
2,"[119554820, 124815566, 125624471, 120960356, 1...",143656996
3,"[16045248, 142491416, 103846762, 119583482, 45...",186627934
4,"[178357420, 135914227, 180944840, 152714225, 1...",206969379


In [6]:
# items_metadata = pd.read_parquet("../data/products.parquet")
items_metadata = pd.read_parquet("../data/products_polyvore.parquet")
items_metadata = items_metadata[["product_id", "product_category"]]
items_metadata.set_index("product_id", inplace=True)
items_metadata.head()

Unnamed: 0_level_0,product_category
product_id,Unnamed: 1_level_1
194508109,Sweatshirts
188778349,Jackets
188977857,Jeans
194942557,Ankle Booties
194941874,Backpacks


In [7]:
outfits_category = full_outfits.explode(column="products")
outfits_category = outfits_category.merge(
    items_metadata,
    left_on="products",
    right_on=items_metadata.index,
    how="inner"
)
outfits_category.rename(columns={"product_category": "categories"}, inplace=True)
outfits_category.head()

Unnamed: 0,products,outfit_id,categories
0,194508109,214181831,Sweatshirts
1,188778349,214181831,Jackets
2,188778349,214181831,Jackets
3,188778349,214181831,Jackets
4,188778349,213791668,Jackets


In [8]:
outfits_category = outfits_category.groupby("outfit_id").agg({"categories": list, "products": list})
outfits_category.reset_index(inplace=True)

outfits_category["categories_sorted"] = outfits_category.apply(lambda row: frozenset(row["categories"]), axis=1)
outfits_category.head()

Unnamed: 0,outfit_id,categories,products,categories_sorted
0,100050716,"[Rompers, Sandals, Backpacks, Cardigans, Cardi...","[82237110, 112606978, 97657274, 93047733, 9304...","(Rompers, Cardigans, Sandals, Backpacks, Food ..."
1,100079699,"[Pumps, Pumps, Pumps, Pumps, Pumps, Day Dresse...","[93928435, 93928435, 93928435, 93928435, 93928...","(Face Makeup, Day Dresses, Mascara, Makeup, Ey..."
2,100095342,"[Sunglasses, Sunglasses, Sunglasses, Vests, Ve...","[92665975, 92665975, 92665975, 94182425, 94182...","(Handbags, Blouses, Sunglasses, Ankle Booties,..."
3,100099673,"[Necklaces, Backpacks, Sweatshirts, Tank Tops,...","[75110602, 90050865, 77436376, 83333043, 89072...","(Sneakers, Skinny Jeans, Beauty Products, Neck..."
4,100119147,"[Blazers, Blazers, Shorts, Pumps, Pumps, Tote ...","[89186204, 89186204, 93122027, 86376642, 86376...","(Shorts, Necklaces, Blazers, Sunglasses, Tote ..."


In [9]:
from collections import Counter, defaultdict

categories_sets = Counter(outfits_category.categories_sorted.values)
toy_model = {
    "items_metadata": items_metadata,
    "categories_sets": categories_sets
}

In [10]:
import itertools
print(print(dict(itertools.islice(categories_sets.items(), 2)) ))

{frozenset({'Rompers', 'Cardigans', 'Sandals', 'Backpacks', 'Food & Drink'}): 1, frozenset({'Face Makeup', 'Day Dresses', 'Mascara', 'Makeup', 'Eyeliner', 'Nail Polish', 'Fragrance', 'Pumps'}): 1}
None


In [11]:
candidates = test_outfits.loc[11, "candidates"]
for candidate in candidates:
    outfit = test_outfits.loc[11, "incomplete_outfit"] + [candidate]
    categories = toy_model["items_metadata"].loc[outfit]["product_category"].tolist()
    print(categories)
    score = toy_model["categories_sets"].get(frozenset(categories), 0)
    print(score)

KeyError: 11

# Predict - Using the category sets to rank the candidates

### Use the file name generatad by evaluation/simple_split_dataset.ipynb here

In [11]:
incomplete_outfits_input = "../data/manual_outfits_testinput_baseline_polyvore.parquet"

In [12]:
test_outfits = pd.read_parquet(incomplete_outfits_input)
test_outfits.head()

Unnamed: 0,outfit_id,incomplete_outfit,candidates
0,119704139,"[102972440, 91303250, 94989504, 103184729]","[156949162, 96522232, 103394173, 127110314]"
1,119314458,"[63368648, 56280997, 100591740, 102258488, 634...","[170124970, 133377267, 101246478, 88978228]"
2,147361785,"[123958925, 120599452, 128671112, 68150400, 12...","[98636735, 126378351, 73351321, 167168953]"
3,148015877,"[105237904, 99169836, 101587279]","[111251852, 141698953, 132717390, 50975359]"
4,192637078,"[163752297, 164436214, 163468278, 164737861, 1...","[190327671, 160481299, 113650934, 194114289]"


In [13]:
def predict(model, incomplete_outfit, candidates):
    """
    This is the core of your model. In our example, we are going to use the toy model we build in the first part 
    of this notebook, but you are free to create your amazing model and use it here.
    Arguments:
        model: the outfits model
        incomplete_outfit: a list of product_id containing the outfit we want to complete
        candidates: a list of product_id that you are tasked to select the rigth product to complete the outfit
    Return
        A list of product_id sorted according your model
    """
    scores = []
    for candidate in candidates:
        outfit = incomplete_outfit + [candidate]
        try:
            categories = model["items_metadata"].loc[outfit]["product_category"].tolist()
            score = model["categories_sets"].get(frozenset(categories), 0)
        except KeyError:
            score = 0
        scores.append((score, candidate))
    return [
        candidate
        for _, candidate in sorted(scores, reverse=True)
    ]

test_outfits["predicted_products"] = test_outfits.apply(lambda row: predict(toy_model, row["incomplete_outfit"], row["candidates"]), axis=1)
test_outfits["predicted_product"] = test_outfits.apply(lambda row: row["predicted_products"][0], axis=1)
test_outfits.head()

Unnamed: 0,outfit_id,incomplete_outfit,candidates,predicted_products,predicted_product
0,119704139,"[102972440, 91303250, 94989504, 103184729]","[156949162, 96522232, 103394173, 127110314]","[103394173, 156949162, 127110314, 96522232]",103394173
1,119314458,"[63368648, 56280997, 100591740, 102258488, 634...","[170124970, 133377267, 101246478, 88978228]","[170124970, 133377267, 101246478, 88978228]",170124970
2,147361785,"[123958925, 120599452, 128671112, 68150400, 12...","[98636735, 126378351, 73351321, 167168953]","[167168953, 126378351, 98636735, 73351321]",167168953
3,148015877,"[105237904, 99169836, 101587279]","[111251852, 141698953, 132717390, 50975359]","[141698953, 132717390, 111251852, 50975359]",141698953
4,192637078,"[163752297, 164436214, 163468278, 164737861, 1...","[190327671, 160481299, 113650934, 194114289]","[194114289, 190327671, 160481299, 113650934]",194114289


In [14]:
output_name = incomplete_outfits_input.replace(".parquet", "_predictions.csv")
output_columns = ["outfit_id", "predicted_product"]
test_outfits[output_columns].to_csv(output_name, header=True, index=False)

In [12]:
print(datetime.datetime.now())

2022-06-13 15:33:16.304723


In [13]:
print(full_outfits.loc[3, "products"])
print(test_outfits.loc[3, "incomplete_outfit"])
print(list(set(full_outfits.loc[3, "products"]) - set(test_outfits.loc[3, "incomplete_outfit"])))
if 17052573 in test_outfits.loc[3, "predicted_products"]:
    print("Predicted the right product at place ", test_outfits.loc[3, "predicted_products"].index(17052573))
else:
    print("Product not predicted :/")

[16127776, 16756133, 17040752, 18203427, 18205465]
[17040752, 16127776, 16756133, 18203427]
[18205465]
Product not predicted :/
