In this notebook we will train a ranking model.

Since our dataset consists of positive user-item interactions (transactions) we need to do negative sampling.

This notebook can be run to generate both training and validation data. Please run the the notebook once, change `USE_TRAIN` below to False, and run the notebook again if you want to generate both datasets.

In [1]:
import pandas as pd

USE_TRAIN = True

if USE_TRAIN:
    train_df = pd.read_csv("train_df.csv", dtype={"item_id" : object})
    ds_name = "ranking_train_df.csv"
else:
    # Use validation data.
    train_df = pd.read_csv("val_df.csv", dtype={"item_id" : object})
    ds_name = "ranking_val_df.csv"

# TODO change name from train_df to df in the following parts of the notebook.

These are the true positive pairs.

In [2]:
query_features = ["user_id", "age", "month_sin", "month_cos"]

positive_pairs = train_df[query_features + ["item_id"]].copy()

positive_pairs

Unnamed: 0,user_id,age,month_sin,month_cos,item_id
0,f42fd54b1a2cce6427622789e29290f8a8261b105b3b78...,27.0,-0.500000,-8.660254e-01,716348004
1,a773d5822fe0c466d271c61b5c001f5114f25b09476c08...,53.0,0.500000,-8.660254e-01,776237026
2,914412f8f47bb1115f83fd3088ce81b993a8602b22c098...,39.0,0.500000,-8.660254e-01,697054003
3,122b81c195da29a23b046880eae54f811fe05635977ea1...,43.0,-0.500000,8.660254e-01,806201003
4,2fbaaca49a30a6ca73d0c527c6821bb320c8aa995fd02a...,50.0,0.500000,-8.660254e-01,855834001
...,...,...,...,...,...
288063,d3c2c620f81ee5e90a1ecf3102999b62948ce0f0e7ade7...,31.0,1.000000,6.123234e-17,681657001
288064,2f608c8291a9c8fb746a86b6c84bb075e828e8264d7592...,32.0,0.000000,1.000000e+00,688873005
288065,e86653d219a641c2374f062273ce6522a9bf7d99696824...,53.0,0.866025,-5.000000e-01,700079003
288066,a4fe1ea2cf6a62a3d2c1f9907b4a5ed5ed471702353e3f...,56.0,-1.000000,-1.836970e-16,399223029


In [3]:
n_neg = len(positive_pairs)*10

negative_pairs = positive_pairs[query_features]\
    .sample(n_neg, replace=True, random_state=1)\
    .reset_index(drop=True)

negative_pairs["item_id"] = positive_pairs["item_id"]\
    .sample(n_neg, replace=True, random_state=2).to_numpy()

negative_pairs

Unnamed: 0,user_id,age,month_sin,month_cos,item_id
0,a73751f517eb392e9c78e9f4d80ef2a43036178e680f1e...,56.0,-1.000000,-1.836970e-16,703805001
1,379c6218457b6f2ca882ae5067508c3ce2cfebce19d969...,29.0,-0.866025,5.000000e-01,372860001
2,681395bce63b2d55bc60d94f2f05b878307a8573956778...,27.0,0.500000,-8.660254e-01,664074001
3,f9053c073933816275a5136bebf3abeeb5afff1b1f9de7...,26.0,-0.500000,8.660254e-01,870290001
4,fc8910d42f9e453d9058559fca7012f3bae029103d1e8b...,28.0,0.866025,-5.000000e-01,733027002
...,...,...,...,...,...
2880675,81c0ef18bc4229055b3df58197bacd95957ad6ea1d41f5...,33.0,-1.000000,-1.836970e-16,801512004
2880676,74a213c5d0d8154f896c19e497248eb9fe8c453a53e196...,52.0,0.500000,-8.660254e-01,740930004
2880677,e8fb9a07db3fbc4023a45c7fecf167f0851a4e335a4446...,26.0,0.000000,1.000000e+00,786089001
2880678,00cc3990759e3937d57f9e8b2e27b655c04ff02827282d...,55.0,1.000000,6.123234e-17,779546005


In [4]:
# Add labels.
positive_pairs["label"] = 1
negative_pairs["label"] = 0

# Concatenate.
ranking_train_df = pd.concat(
    [positive_pairs, negative_pairs],
    ignore_index=True)


In [5]:
ranking_train_df

Unnamed: 0,user_id,age,month_sin,month_cos,item_id,label
0,f42fd54b1a2cce6427622789e29290f8a8261b105b3b78...,27.0,-0.5,-8.660254e-01,716348004,1
1,a773d5822fe0c466d271c61b5c001f5114f25b09476c08...,53.0,0.5,-8.660254e-01,776237026,1
2,914412f8f47bb1115f83fd3088ce81b993a8602b22c098...,39.0,0.5,-8.660254e-01,697054003,1
3,122b81c195da29a23b046880eae54f811fe05635977ea1...,43.0,-0.5,8.660254e-01,806201003,1
4,2fbaaca49a30a6ca73d0c527c6821bb320c8aa995fd02a...,50.0,0.5,-8.660254e-01,855834001,1
...,...,...,...,...,...,...
3168743,81c0ef18bc4229055b3df58197bacd95957ad6ea1d41f5...,33.0,-1.0,-1.836970e-16,801512004,0
3168744,74a213c5d0d8154f896c19e497248eb9fe8c453a53e196...,52.0,0.5,-8.660254e-01,740930004,0
3168745,e8fb9a07db3fbc4023a45c7fecf167f0851a4e335a4446...,26.0,0.0,1.000000e+00,786089001,0
3168746,00cc3990759e3937d57f9e8b2e27b655c04ff02827282d...,55.0,1.0,6.123234e-17,779546005,0


In [6]:
# Merge with item features.

candidate_features = ["item_id", "garment_group_name", "index_group_name"]

item_df = train_df[candidate_features].copy()
item_df.drop_duplicates(subset="item_id", inplace=True)

ranking_train_df = ranking_train_df.merge(item_df, on="item_id")

ranking_train_df.sample(5)

Unnamed: 0,user_id,age,month_sin,month_cos,item_id,label,garment_group_name,index_group_name
668612,9fbb249e0562148344ae799dad9d154ee5589f65f39e45...,46.0,-1.0,-1.83697e-16,559601002,0,Swimwear,Ladieswear
1764104,5d3220f7cf5966d76423f04f028a67e9408b7dbc9ba2e3...,26.0,0.5,-0.8660254,697985001,0,Knitwear,Ladieswear
2753074,3dfe9d03179412f25c050e825ee8455c1415ad75c9318c...,53.0,-0.5,-0.8660254,869663003,0,Dresses Ladies,Ladieswear
444509,cdfd5d24a3c6a8d81c60607b1ff2d85cf35c194d22f133...,30.0,0.5,0.8660254,684209004,0,Swimwear,Ladieswear
1902075,641cdae218ba5431bfaed29526127c79fcec1731b2ce8f...,59.0,0.866025,-0.5,733181001,0,Trousers,Ladieswear


In [7]:
import tensorflow as tf

# Load models.
item_model = tf.keras.models.load_model("item_model")
user_model = tf.keras.models.load_model("user_model")

2022-05-19 15:10:03.009566: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.




In [8]:
ranking_train_ds = tf.data.Dataset.from_tensor_slices(
    {col: ranking_train_df[col] for col in ranking_train_df})


In [9]:
import numpy as np

item_feat = ["item_id", "garment_group_name", "index_group_name"]

# TODO this code is ugly.
# item and user models are a bit inconsistent for some reason...
# TODO can we get names of inputs automatically from the models?
def compute_dist(x):
    x.pop("label")
    item_embeddings = item_model({i : x[i] for i in item_feat})
    user_embeddings = user_model(x)
    return tf.keras.losses.cosine_similarity(item_embeddings, user_embeddings)

ranking_train_df["cos"] = np.concatenate([res.numpy() for res in ranking_train_ds.batch(2048).map(compute_dist)])

ranking_train_df.sample(5)

In [12]:
ranking_train_df.to_csv(ds_name, index=False)