In [1]:
import time

# Capture the wall-clock start so the last cell can report the total notebook run time.
notebook_start_time = time.time()

# Set up environment

In [2]:
import sys
from pathlib import Path


def is_google_colab() -> bool:
    """Report whether the notebook is executing inside Google Colab.

    The check looks for the ``google.colab`` marker in the string form of the
    shell object returned by ``get_ipython()``.

    Returns:
        True when the marker is present, False otherwise.
    """
    shell_repr = str(get_ipython())
    return "google.colab" in shell_repr


def clone_repository() -> None:
    """Clone the course repository and switch the working directory into it.

    Relies on IPython shell and line magics, so it can only run inside a
    notebook kernel. Later code reads the current directory to locate the
    repository root, which is why the directory change happens here.
    """
    !git clone https://github.com/decodingml/hands-on-recommender-system.git
    %cd hands-on-recommender-system/

def install_dependencies() -> None:
    """Install the project's dependencies with uv.

    Must be called from the repository root because ``pyproject.toml`` is
    referenced by a relative path.
    """
    # Install (or upgrade) uv itself through pip first.
    !pip install --upgrade uv
    # Install everything declared in pyproject.toml, including all extras, into the
    # system interpreter (no virtual environment is created).
    !uv pip install --all-extras --system --requirement pyproject.toml


# Resolve the repository root for the current environment.
if not is_google_colab():
    # Local run: the root is taken as the parent of the current working directory
    # (presumably the notebook is started from a sub-folder of the repository).
    root_dir = str(Path().absolute().parent)
    print("⛳️ Local environment")
else:
    # Colab run: order matters here. The repository has to be cloned (which also
    # changes the working directory) before the current directory is read.
    clone_repository()
    install_dependencies()

    root_dir = str(Path().absolute())
    print("⛳️ Google Colab environment")

# Put the root directory on `sys.path` (once) so the `recsys` package can be imported below.
if root_dir not in sys.path:
    print(f"Adding the following directory to the PYTHONPATH: {root_dir}")
    sys.path.append(root_dir)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[2mnvidia-cuda-cupti-cu12[0m [32m----------------[2m--------------[0m[0m 6.65 MiB/13.17 MiB
[2mnvidia-nvjitlink-cu12[0m [32m----------[2m--------------------[0m[0m 6.65 MiB/20.09 MiB
[2mnvidia-cuda-nvrtc-cu12[0m [32m---------[2m---------------------[0m[0m 6.59 MiB/23.50 MiB
[2mnvidia-curand-cu12[0m [32m----[2m--------------------------[0m[0m 6.63 MiB/53.70 MiB
[2mcatboost  [0m [32m---[2m---------------------------[0m[0m 6.52 MiB/93.98 MiB
[2mnvidia-cusolver-cu12[0m [32m--[2m----------------------------[0m[0m 6.63 MiB/122.01 MiB
[2mnvidia-cusparse-cu12[0m [32m-[2m-----------------------------[0m[0m 6.47 MiB/197.84 MiB
[2mnvidia-cufft-cu12[0m [32m-[2m-----------------------------[0m[0m 6.65 MiB/201.66 MiB
[2mnvidia-cublas-cu12[0m [32m-[2m-----------------------------[0m[0m 6.83 MiB/346.60 MiB
[2mtensorflow[0m [32m-[2m-----------------------------[0m[0m 6.85 MiB/467

# 🧬 Training pipeline: Training ranking model

In this notebook, you will train a ranking model using gradient boosted trees.

## 📝 Imports

In [3]:
# Reload edited modules automatically before each cell runs, so changes made to the
# `recsys` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import warnings

# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

from loguru import logger

from recsys import hopsworks_integration, training
from recsys.config import settings

## Constants

In [4]:
from pprint import pprint

# Cell outputs are saved together with the notebook, so credential-like settings are
# masked before printing. Unset (None) values are shown as-is so that missing
# configuration remains visible to the reader.
pprint(
    {
        name: "********"
        if value is not None
        and str(name).upper().endswith(("_KEY", "_TOKEN", "_SECRET", "_PASSWORD"))
        else value
        for name, value in dict(settings).items()
    }
)

{'CUSTOMER_DATA_SIZE': <CustomerDatasetSize.SMALL: 'SMALL'>,
 'CUSTOM_HOPSWORKS_INFERENCE_ENV': 'custom_env_name',
 'FEATURES_EMBEDDING_MODEL_ID': 'all-MiniLM-L6-v2',
 'HOPSWORKS_API_KEY': None,
 'OPENAI_API_KEY': None,
 'OPENAI_MODEL_ID': 'gpt-4o-mini',
 'RANKING_DATASET_VALIDATON_SPLIT_SIZE': 0.1,
 'RANKING_EARLY_STOPPING_ROUNDS': 5,
 'RANKING_ITERATIONS': 100,
 'RANKING_LEARNING_RATE': 0.2,
 'RANKING_MODEL_TYPE': 'ranking',
 'RANKING_SCALE_POS_WEIGHT': 10,
 'RECSYS_DIR': PosixPath('/content/hands-on-recommender-system/recsys'),
 'TWO_TOWER_DATASET_TEST_SPLIT_SIZE': 0.1,
 'TWO_TOWER_DATASET_VALIDATON_SPLIT_SIZE': 0.1,
 'TWO_TOWER_LEARNING_RATE': 0.01,
 'TWO_TOWER_MODEL_BATCH_SIZE': 2048,
 'TWO_TOWER_MODEL_EMBEDDING_SIZE': 16,
 'TWO_TOWER_NUM_EPOCHS': 10,
 'TWO_TOWER_WEIGHT_DECAY': 0.001}


## <span style="color:#ff5f27">🔮 Connect to Hopsworks Feature Store </span>

In [5]:
# Log in to Hopsworks and keep handles to the project and its feature store.
# In the recorded run the API key was entered at an interactive prompt.
project, fs = hopsworks_integration.get_feature_store()

[32m2025-02-19 03:48:24.332[0m | [1mINFO    [0m | [36mrecsys.hopsworks_integration.feature_store[0m:[36mget_feature_store[0m:[36m18[0m - [1mLogin to Hopsworks using cached API key.[0m


Copy your Api Key (first register/login): https://c.app.hopsworks.ai/account/api/generated

Paste it here: ··········

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1213603


# Getting the training data

In [6]:
# Obtain the feature view the ranking model is trained from. The recorded output shows
# that the customers and articles feature views are created by this call as well.
feature_view_ranking = hopsworks_integration.feature_store.create_ranking_feature_views(
    fs
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1213603/fs/1201233/fv/customers/version/1
Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1213603/fs/1201233/fv/articles/version/1
Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1213603/fs/1201233/fv/ranking/version/1


In [7]:
# Read the ranking dataset from the feature view and split it into training and
# validation features/labels; the held-out fraction comes from the project settings.
X_train, X_val, y_train, y_val = feature_view_ranking.train_test_split(
    test_size=settings.RANKING_DATASET_VALIDATON_SPLIT_SIZE,
    description="Ranking training dataset",
)
# Preview the first rows of the training features.
X_train.head(3)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (3.23s) 




Unnamed: 0,age,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,garment_group_name,month_sin,month_cos
0,20.0,Earring,Accessories,Solid,Light Orange,Light,Orange,Jewellery,Ladies Accessories,Ladieswear,Womens Small accessories,Accessories,0.866025,-0.5
1,33.0,Vest top,Garment Upper body,Solid,Beige,Dusty Light,Mole,Jersey fancy,Ladieswear,Ladieswear,Womens Everyday Collection,Jersey Fancy,1.0,6.123234000000001e-17
2,21.0,Swimwear bottom,Swimwear,Solid,Black,Dark,Black,Swimwear,Lingeries/Tights,Ladieswear,"Womens Swimwear, beachwear",Swimwear,0.866025,0.5


In [8]:
# Preview the first labels of the training split.
y_train.head(3)

Unnamed: 0,label
0,1
1,1
2,1


# Training the ranking model

Let's train the ranking model:

In [9]:
# Create the ranking model through the project's factory, then hand it to a trainer
# together with the training and validation splits.
model = training.ranking.RankingModelFactory.build()
trainer = training.ranking.RankingModelTrainer(
    model=model, train_dataset=(X_train, y_train), eval_dataset=(X_val, y_val)
)

In [10]:
# Train the model; the log below reports the learn/test loss for every boosting iteration.
trainer.fit()

0:	learn: 0.5147134	test: 0.5147105	best: 0.5147105 (0)	total: 199ms	remaining: 19.7s
1:	learn: 0.3950144	test: 0.3950168	best: 0.3950168 (1)	total: 470ms	remaining: 23s
2:	learn: 0.3091766	test: 0.3091824	best: 0.3091824 (2)	total: 618ms	remaining: 20s
3:	learn: 0.2453475	test: 0.2453590	best: 0.2453590 (3)	total: 696ms	remaining: 16.7s
4:	learn: 0.1967673	test: 0.1967835	best: 0.1967835 (4)	total: 833ms	remaining: 15.8s
5:	learn: 0.1593520	test: 0.1593549	best: 0.1593549 (5)	total: 1.04s	remaining: 16.4s
6:	learn: 0.1298889	test: 0.1299036	best: 0.1299036 (6)	total: 1.1s	remaining: 14.6s
7:	learn: 0.1066422	test: 0.1066697	best: 0.1066697 (7)	total: 1.18s	remaining: 13.5s
8:	learn: 0.0881874	test: 0.0882238	best: 0.0882238 (8)	total: 1.29s	remaining: 13s
9:	learn: 0.0734695	test: 0.0735152	best: 0.0735152 (9)	total: 1.38s	remaining: 12.4s
10:	learn: 0.0616903	test: 0.0617446	best: 0.0617446 (10)	total: 1.46s	remaining: 11.8s
11:	learn: 0.0522553	test: 0.0523187	best: 0.0523187 (11)	t

<catboost.core.CatBoostClassifier at 0x7e54f6dbcb90>

## Evaluating the ranking model

Next, you'll evaluate how well the model performs on the validation data using metrics for classification such as precision, recall and f1-score:

In [11]:
# Compute the evaluation metrics and keep them for model registration further down;
# with log=True the classification report is also written to the logger (see output).
metrics = trainer.evaluate(log=True)

[32m2025-02-19 03:51:20.506[0m | [1mINFO    [0m | [36mrecsys.training.ranking[0m:[36mevaluate[0m:[36m62[0m - [1m              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19956
           1       0.95      1.00      0.98      1880

    accuracy                           1.00     21836
   macro avg       0.98      1.00      0.99     21836
weighted avg       1.00      1.00      1.00     21836
[0m


Check the F1-score on the positive class (higher is better): in the run recorded above it is 0.98. If it turns out to be low on your data, the performance could potentially be improved by adding more features to the dataset, e.g. image embeddings.

Let's see which features your model considers important.

In [12]:
# Show the importance score of each feature (the recorded output lists them in
# descending order).
trainer.get_feature_importance()

{'month_sin': 51.539266504897896,
 'month_cos': 43.21749296318156,
 'garment_group_name': 1.1095949444799105,
 'perceived_colour_value_name': 0.6041100432455446,
 'index_group_name': 0.4979197280855095,
 'age': 0.43146957755498183,
 'product_group_name': 0.4073585738515272,
 'perceived_colour_master_name': 0.36842097261377904,
 'product_type_name': 0.36565976038783965,
 'graphical_appearance_name': 0.34505076416994007,
 'index_name': 0.31488686940205607,
 'section_name': 0.30102426212384303,
 'department_name': 0.2665573830759739,
 'colour_group_name': 0.23118765292959678}

## <span style="color:#ff5f27">  Uploading the model to Hopsworks model registry </span>

In [13]:
# Handle to the project's model registry, used below to upload the trained model.
mr = project.get_model_registry()

In [14]:
# Wrap the trained model and register it in the model registry, passing along the
# feature view, the training features and the evaluation metrics.
# NOTE(review): X_train is presumably used as an input example / schema source —
# confirm in `recsys.hopsworks_integration.ranking_serving`.
ranking_module = hopsworks_integration.ranking_serving.HopsworksRankingModel(
    model=model
)
ranking_module.register(mr, feature_view_ranking, X_train, metrics)

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/617223 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/433 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1213603/models/ranking_model/1


## <span style="color:#ff5f27"> Inspecting the model in the Hopsworks model registry </span>

View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Data Science → Model Registry**

---

In [15]:
# Report the total wall-clock time elapsed since the first cell, in seconds and minutes.
notebook_end_time = time.time()
notebook_execution_time = notebook_end_time - notebook_start_time

logger.info(
    f"⌛️ Notebook Execution time: {notebook_execution_time:.2f} seconds ~ {notebook_execution_time / 60:.2f} minutes"
)

[32m2025-02-19 03:51:53.429[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m4[0m - [1m⌛️ Notebook Execution time: 329.52 seconds ~ 5.49 minutes[0m


# <span style="color:#ff5f27">→ Next Steps </span>

In the next notebook, you will compute embeddings for all the items, populate a vector index with them (as a feature group) and create an online feature view which will allow you to retrieve candidates, for each user, with very low latency.