## Create Ranking Dataset

In this notebook, we'll create a dataset for our ranking model. Since our dataset only consists of positive user-item interactions (transactions), we need to do negative sampling. (Otherwise, our model might just recommend all items to all users.)

This notebook can be run to generate both training and validation data. To generate both datasets, run the notebook once, change `USE_TRAIN` below to `False`, and then run the notebook again.

In [None]:
# Connection details for running this notebook from an external Python
# environment; adjust the project, host and key file to match your setup.
import os

key = ""
with open("api-key.txt", "r") as key_file:
    # The API key is read from a local file rather than written into the
    # notebook; trailing whitespace (e.g. the final newline) is stripped.
    key = key_file.read().rstrip()

os.environ.update(
    {
        "HOPSWORKS_PROJECT": "hm",
        "HOPSWORKS_HOST": "35.240.81.237",
        "HOPSWORKS_API_KEY": key,
    }
)

In [None]:
import hopsworks

# Log in to Hopsworks and get a handle to the project's feature store.
# NOTE(review): presumably login() picks up the HOPSWORKS_* environment
# variables when they are set — confirm against the hopsworks docs.
project = hopsworks.login()
fs = project.get_feature_store()

In [1]:

# Set to False (and re-run the notebook) to build the validation set instead.
USE_TRAIN = True

# The split name selects which data is read and names the output file.
if USE_TRAIN:
    split = "train"
else:
    split = "validation"
ds_name = f"ranking_{split}.csv"

# Read the chosen split of the retrieval training dataset.
td = fs.get_training_dataset("retrieval_fv_1")
df = td.read(split)

# Article ids are handled as strings from here on.
df["article_id"] = df["article_id"].astype(str)

These are the true positive pairs.

In [2]:
# Customer-side (query) columns kept for every pair.
query_features = ["customer_id", "age", "month_sin", "month_cos"]

# Keep the query columns plus the article id. The slice is copied so that
# columns added to `positive_pairs` later do not touch `df`.
pair_columns = [*query_features, "article_id"]
positive_pairs = df.loc[:, pair_columns].copy()

positive_pairs

Unnamed: 0,customer_id,age,month_sin,month_cos,article_id
0,41f34be4afe6bdcd162b5f224a7de5afd00ea603bb9eb3...,28.0,0.866025,-5.000000e-01,825182002
1,2c75e62fb31d0a34c2b98cf258fcb07dc7ea4814175122...,41.0,0.000000,1.000000e+00,536139006
2,24c05afdbc8452193e1fbe9880f4f211362cf1bafd98bf...,25.0,-0.500000,-8.660254e-01,571706001
3,2d0c8329e11897a04e500a1f732935f0cdc7c05e503633...,49.0,-0.866025,-5.000000e-01,667491002
4,6e660ecbdb728c0348d5f8cd4d7e55dab0bdce68b0fbc3...,20.0,-0.866025,-5.000000e-01,904734001
...,...,...,...,...,...
175648,3887ee49b2369608b71c044a33fb76f7c365661e1988aa...,20.0,0.500000,8.660254e-01,784247002
175649,72bab485b89f06b728fe9d16d64755a651c45a8b148358...,54.0,1.000000,6.123234e-17,814766002
175650,fa71b8e314a8376b20b3fc4ef9a3da7860f48102ddf3bc...,34.0,-1.000000,-1.836970e-16,745250003
175651,4d71a3900a8a679e0b7c78f95eae29c582d16ccfbdc433...,50.0,1.000000,6.123234e-17,708345008


In [3]:
# Number of negative candidates drawn per positive pair.
NEG_PER_POS = 10
n_neg = len(positive_pairs) * NEG_PER_POS

# Draw customers (with their query features) and articles independently,
# each with replacement and its own fixed seed so the result is reproducible.
negative_pairs = (
    positive_pairs[query_features]
    .sample(n_neg, replace=True, random_state=1)
    .reset_index(drop=True)
)
negative_pairs["article_id"] = (
    positive_pairs["article_id"]
    .sample(n_neg, replace=True, random_state=2)
    .to_numpy()
)

# A random (customer, article) pairing can coincide with a real purchase.
# Such a row would be labelled 0 while the same pair is labelled 1 among the
# positives, so remove every sampled pair that exists in `positive_pairs`.
# The left merge keeps the sampled rows in order; `indicator=True` adds a
# `_merge` column marking rows that found no match ("left_only").
pair_keys = ["customer_id", "article_id"]
purchased = positive_pairs[pair_keys].drop_duplicates()
negative_pairs = (
    negative_pairs
    .merge(purchased, on=pair_keys, how="left", indicator=True)
    .loc[lambda pairs: pairs["_merge"] == "left_only"]
    .drop(columns="_merge")
    .reset_index(drop=True)
)

negative_pairs

Unnamed: 0,customer_id,age,month_sin,month_cos,article_id
0,d70566b9ffb797606676b64889ad9751a1d43d02f2c8dc...,26.0,0.500000,-8.660254e-01,702623004
1,76d56add4c1f8a64d87c014f662dc2e51734ff8adbebab...,32.0,1.000000,6.123234e-17,851607001
2,1ac72e15eb24902c1c2616138f2b13c7c45e209565046c...,58.0,-0.866025,-5.000000e-01,788256002
3,818b73de70a3324684ab1f4556b1a1d2d5338d6d7273c5...,23.0,0.500000,-8.660254e-01,762096004
4,5d2cd47a2871976a66ec3918033508c2c3dd050d1bce82...,52.0,-0.500000,-8.660254e-01,699454003
...,...,...,...,...,...
1756525,f2a28deedd75806a5428eb9278e8d226e0d9a0f22e2fcf...,22.0,-0.866025,5.000000e-01,814655001
1756526,b098f6ad370892001bdd0630ab5fc35e0c3c15f64a7ac1...,41.0,-0.866025,-5.000000e-01,848681005
1756527,9a4e2095318ff888d9233bd64684f67db1ace8de2148ac...,27.0,-0.866025,5.000000e-01,712159001
1756528,bcf31687544e1dd46edae710e10f80af27d5f53b12c1da...,38.0,0.000000,1.000000e+00,696390001


In [4]:
import pandas as pd

# Label the true pairs 1 and the sampled pairs 0.
for pairs, label in ((positive_pairs, 1), (negative_pairs, 0)):
    pairs["label"] = label

# Stack positives on top of negatives under a fresh 0..n-1 index.
ranking_df = pd.concat([positive_pairs, negative_pairs], axis=0, ignore_index=True)

In [5]:
# Show the combined frame of labelled positive and negative pairs.
ranking_df

Unnamed: 0,customer_id,age,month_sin,month_cos,article_id,label
0,41f34be4afe6bdcd162b5f224a7de5afd00ea603bb9eb3...,28.0,0.866025,-0.500000,825182002,1
1,2c75e62fb31d0a34c2b98cf258fcb07dc7ea4814175122...,41.0,0.000000,1.000000,536139006,1
2,24c05afdbc8452193e1fbe9880f4f211362cf1bafd98bf...,25.0,-0.500000,-0.866025,571706001,1
3,2d0c8329e11897a04e500a1f732935f0cdc7c05e503633...,49.0,-0.866025,-0.500000,667491002,1
4,6e660ecbdb728c0348d5f8cd4d7e55dab0bdce68b0fbc3...,20.0,-0.866025,-0.500000,904734001,1
...,...,...,...,...,...,...
1932178,f2a28deedd75806a5428eb9278e8d226e0d9a0f22e2fcf...,22.0,-0.866025,0.500000,814655001,0
1932179,b098f6ad370892001bdd0630ab5fc35e0c3c15f64a7ac1...,41.0,-0.866025,-0.500000,848681005,0
1932180,9a4e2095318ff888d9233bd64684f67db1ace8de2148ac...,27.0,-0.866025,0.500000,712159001,0
1932181,bcf31687544e1dd46edae710e10f80af27d5f53b12c1da...,38.0,0.000000,1.000000,696390001,0


In [6]:
# Read the article (item) features, keeping a single row per article_id.
articles_fg = fs.get_feature_group("articles")
item_df = articles_fg.read().drop_duplicates(subset="article_id")

# Inner join on article_id: attach the article features to every pair.
ranking_df = ranking_df.merge(item_df, how="inner", on="article_id")

In [None]:
import hsml

# Connect to the model registry.
conn = hsml.connection()
mr = conn.get_model_registry()

# Look up the registered "candidate_model" and download it; the value
# returned by download() is the path the model is loaded from afterwards.
candidate_model = mr.get_model("candidate_model")
candidate_model_path = candidate_model.download()

# Same for the registered "query_model".
query_model = mr.get_model("query_model")
query_model_path = query_model.download()

In [7]:
import tensorflow as tf

# Load models.
# Both are loaded from the paths downloaded from the model registry: the
# candidate model serves as the item model, the query model as the user model.
item_model = tf.keras.models.load_model(candidate_model_path)
user_model = tf.keras.models.load_model(query_model_path)

2022-05-25 16:16:22.512658: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.




Next, we compute the query and candidate embeddings.

In [9]:
import numpy as np

# Each registered model carries a columnar input schema; its entries' "name"
# fields are the input columns that model expects.
candidate_model_schema = candidate_model.model_schema["input_schema"]["columnar_schema"]
query_model_schema = query_model.model_schema["input_schema"]["columnar_schema"]

# Note: this rebinds `query_features`, which was first defined by hand when
# the positive pairs were built.
item_features = [column["name"] for column in candidate_model_schema]
query_features = [column["name"] for column in query_model_schema]

def df_to_ds(df):
    """Build a tf.data.Dataset from a DataFrame.

    The frame is passed to `tf.data.Dataset.from_tensor_slices` as a dict
    mapping each column name to that column.
    """
    columns = {}
    for name in df:
        columns[name] = df[name]
    return tf.data.Dataset.from_tensor_slices(columns)

# Model inputs: the columns each model expects, taken from ranking_df.
item_ds = df_to_ds(ranking_df[item_features])
query_ds = df_to_ds(ranking_df[query_features])

# Apply each model to its inputs in batches of 2048 rows.
batch_size = 2048
item_emb_ds = item_ds.batch(batch_size).map(item_model)
user_emb_ds = query_ds.batch(batch_size).map(user_model)

# Collect the per-batch outputs into a single array per model.
item_emb_arr = np.concatenate([emb.numpy() for emb in item_emb_ds], axis=0)
user_emb_arr = np.concatenate([emb.numpy() for emb in user_emb_ds], axis=0)

# Prefix the integer column labels: item_emb_0, item_emb_1, ... / user_emb_0, ...
item_emb_df = pd.DataFrame(item_emb_arr).add_prefix("item_emb_")
user_emb_df = pd.DataFrame(user_emb_arr).add_prefix("user_emb_")

# Column-wise concat aligns rows on the index, so this relies on ranking_df
# carrying the same default 0..n-1 index as the freshly built embedding frames.
ranking_df = pd.concat([ranking_df, item_emb_df, user_emb_df], axis="columns")

There are several "duplicated" categorical features in the dataset. For instance, `index_code` and `index_name` encode the same feature, but in different formats (int, string). Therefore, we have to deduplicate these features.

In [10]:
def exclude_feat(s):
    """Return True if the column name `s` ends with "_id", "_no" or "_code"."""
    return s.endswith(("_id", "_no", "_code"))

# Every column flagged by exclude_feat, plus prod_name.
features_to_exclude = [*filter(exclude_feat, ranking_df.columns), "prod_name"]

ranking_df = ranking_df.drop(columns=features_to_exclude)

ranking_df.head()

Unnamed: 0,age,month_sin,month_cos,label,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,...,user_emb_6,user_emb_7,user_emb_8,user_emb_9,user_emb_10,user_emb_11,user_emb_12,user_emb_13,user_emb_14,user_emb_15
0,28.0,0.866025,-0.5,1,Dress,Garment Full body,Solid,Beige,Medium Dusty,Beige,...,0.656987,-0.932891,-0.569415,-0.453538,-0.882349,-0.115333,-0.254943,0.470428,-0.317947,-0.160978
1,26.0,-0.5,-0.866025,0,Dress,Garment Full body,Solid,Beige,Medium Dusty,Beige,...,1.998003,-0.032723,1.009787,-0.178237,-0.572892,-1.146646,0.622664,0.219331,-0.700367,0.981967
2,31.0,0.5,-0.866025,0,Dress,Garment Full body,Solid,Beige,Medium Dusty,Beige,...,1.334583,-1.036122,0.476515,-0.527729,-1.734965,-0.576535,-0.083657,-0.066746,-0.45998,0.937554
3,18.0,0.866025,0.5,0,Dress,Garment Full body,Solid,Beige,Medium Dusty,Beige,...,0.572441,-1.04228,-0.768781,-0.623541,-0.437548,-0.146008,0.044053,-0.145812,-0.549785,-1.008755
4,56.0,0.5,0.866025,0,Dress,Garment Full body,Solid,Beige,Medium Dusty,Beige,...,1.573153,-0.441142,1.140333,-1.988783,-0.899474,-1.043525,-1.258397,1.144067,1.382314,-0.062828


In [11]:
# Write the ranking dataset to `ds_name` as CSV, without the index column.
ranking_df.to_csv(ds_name, index=False)

In [None]:
import hopsworks

# Upload the CSV written above into the project's "Resources" dataset.
# NOTE(review): overwrite=True presumably replaces an existing file of the
# same name — confirm against the hopsworks dataset API docs.
connection = hopsworks.connection()
project = connection.get_project()
dataset_api = project.get_dataset_api()
uploaded_file_path = dataset_api.upload(ds_name, "Resources", overwrite=True)

### Next Steps

In the next notebook, we'll train a ranking model on the dataset we created in this notebook.