In [1]:
import time

# Wall-clock start of the run; the last cell subtracts it from the end time to log the total execution time.
notebook_start_time = time.time()

# Set up environment

In [2]:
import sys
from pathlib import Path


def is_google_colab() -> bool:
    """Tell whether the notebook is running inside Google Colab.

    The check looks for the substring ``google.colab`` in the string form of
    the active IPython shell object returned by ``get_ipython()``.

    Returns:
        True when the substring is present, False otherwise.
    """
    shell_repr = str(get_ipython())
    return "google.colab" in shell_repr


def clone_repository() -> None:
    !git clone https://github.com/decodingml/hands-on-recommender-system.git
    %cd hands-on-recommender-system/


def install_dependencies() -> None:
    """Install the project's Python dependencies with uv.

    Expects the current working directory to contain ``pyproject.toml``,
    because the requirement file is referenced by a relative path.
    """
    # Install (or upgrade) the uv installer itself through pip.
    !pip install --upgrade uv
    # Install the requirements declared in pyproject.toml, including all extras,
    # into the system interpreter (`--system`) instead of a virtual environment.
    !uv pip install --all-extras --system --requirement pyproject.toml


# Decide once which environment we are in; the answer drives both the setup
# steps and where the repository root is expected to be.
running_in_colab = is_google_colab()

if running_in_colab:
    clone_repository()
    install_dependencies()

# Resolved only after the optional clone step, which changes the working
# directory: in Colab the working directory itself is taken as the root,
# locally its parent is.
current_dir = Path().absolute()
root_dir = str(current_dir) if running_in_colab else str(current_dir.parent)
print("⛳️ Google Colab environment" if running_in_colab else "⛳️ Local environment")

# Make the `recsys` Python module importable from the notebook by putting the
# root directory on `sys.path` (skipped when it is already there).
if root_dir not in sys.path:
    print(f"Adding the following directory to the PYTHONPATH: {root_dir}")
    sys.path.append(root_dir)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[2mcatboost  [0m [32m--[2m----------------------------[0m[0m 4.00 MiB/93.98 MiB
[2mnvidia-cusolver-cu12[0m [32m--[2m----------------------------[0m[0m 6.90 MiB/122.01 MiB
[2mnvidia-cusparse-cu12[0m [32m--[2m----------------------------[0m[0m 6.60 MiB/197.84 MiB
[2mnvidia-cufft-cu12[0m [32m--[2m----------------------------[0m[0m 6.92 MiB/201.66 MiB
[2mnvidia-cublas-cu12[0m [32m-[2m-----------------------------[0m[0m 6.70 MiB/346.60 MiB
[2mtensorflow[0m [32m-[2m-----------------------------[0m[0m 6.87 MiB/467.18 MiB
[2K[16A[37m⠼[0m [2mPreparing packages...[0m (45/63)
[2mlangchain [0m [32m--------------------[2m----------[0m[0m 623.89 KiB/952.60 KiB
[2mjedi      [0m [32m---------------[2m---------------[0m[0m 720.00 KiB/1.50 MiB
[2mstreamlit [0m [32m-----------------------[2m-------[0m[0m 5.92 MiB/8.04 MiB
[2mpandas    [0m [32m------------------[2m------------[

# 👩🏻‍🔬 Offline inference pipeline: Computing item embeddings

In this notebook you will compute the candidate embeddings and populate a Hopsworks feature group with a vector index.

## 📝 Imports

In [3]:
import warnings

# Silence every Python warning for the rest of the session to keep cell output compact.
# NOTE(review): this is a blanket filter, so deprecation warnings are hidden as well.
warnings.filterwarnings("ignore")

from loguru import logger

from recsys import features, hopsworks_integration
from recsys.config import settings

## Constants

In [4]:
from pprint import pprint

# Settings whose names look like credentials are masked before printing, so a
# configured key is never written into the saved notebook output. Unset values
# (None) are printed unchanged so a missing key is still easy to spot.
SENSITIVE_NAME_MARKERS = ("API_KEY", "SECRET", "TOKEN", "PASSWORD")

pprint(
    {
        name: (
            "***"
            if value is not None
            and any(marker in name.upper() for marker in SENSITIVE_NAME_MARKERS)
            else value
        )
        for name, value in dict(settings).items()
    }
)

{'CUSTOMER_DATA_SIZE': <CustomerDatasetSize.SMALL: 'SMALL'>,
 'CUSTOM_HOPSWORKS_INFERENCE_ENV': 'custom_env_name',
 'FEATURES_EMBEDDING_MODEL_ID': 'all-MiniLM-L6-v2',
 'HOPSWORKS_API_KEY': None,
 'OPENAI_API_KEY': None,
 'OPENAI_MODEL_ID': 'gpt-4o-mini',
 'RANKING_DATASET_VALIDATON_SPLIT_SIZE': 0.1,
 'RANKING_EARLY_STOPPING_ROUNDS': 5,
 'RANKING_ITERATIONS': 100,
 'RANKING_LEARNING_RATE': 0.2,
 'RANKING_MODEL_TYPE': 'ranking',
 'RANKING_SCALE_POS_WEIGHT': 10,
 'RECSYS_DIR': PosixPath('/content/hands-on-recommender-system/recsys'),
 'TWO_TOWER_DATASET_TEST_SPLIT_SIZE': 0.1,
 'TWO_TOWER_DATASET_VALIDATON_SPLIT_SIZE': 0.1,
 'TWO_TOWER_LEARNING_RATE': 0.01,
 'TWO_TOWER_MODEL_BATCH_SIZE': 2048,
 'TWO_TOWER_MODEL_EMBEDDING_SIZE': 16,
 'TWO_TOWER_NUM_EPOCHS': 10,
 'TWO_TOWER_WEIGHT_DECAY': 0.001}


## <span style="color:#ff5f27">🔮 Connect to Hopsworks Feature Store </span>

In [5]:
# Log in to Hopsworks through the project helper; it returns the project handle
# and the feature store handle used by the cells below.
project, fs = hopsworks_integration.get_feature_store()

# Model registry handle, needed to download the trained candidate model.
mr = project.get_model_registry()

[32m2025-02-19 03:55:26.858[0m | [1mINFO    [0m | [36mrecsys.hopsworks_integration.feature_store[0m:[36mget_feature_store[0m:[36m18[0m - [1mLogin to Hopsworks using cached API key.[0m


Copy your Api Key (first register/login): https://c.app.hopsworks.ai/account/api/generated

Paste it here: ··········

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1213603


# Computing candidate embeddings

You start by computing candidate embeddings for all items in the training data.

First, you load your candidate model. Recall that you uploaded it to the Hopsworks Model Registry in previous steps:

In [6]:
# Download the candidate model from the model registry. The call returns the
# model together with `candidate_features`, which is passed to the
# preprocessing step further down.
candidate_model, candidate_features = (
    hopsworks_integration.two_tower_serving.HopsworksCandidateModel.download(mr=mr)
)

[32m2025-02-19 03:55:44.526[0m | [1mINFO    [0m | [36mrecsys.hopsworks_integration.two_tower_serving[0m:[36mdownload[0m:[36m185[0m - [1mDownloading 'candidate_model' version 1[0m




### Get candidates data

Now, you fetch the retrieval training data, which contains all the features required by the candidate embedding model.

In [7]:
# Fetch version 1 of the "retrieval" feature view from the feature store.
feature_view = fs.get_feature_view(
    name="retrieval",
    version=1,
)

In [8]:
# Split the feature view's data into train/validation/test sets using the split
# sizes from the project settings. The call returns six values; only the first
# three are kept here.
# NOTE(review): the three discarded values are presumably the label frames of
# each split - confirm against the Hopsworks feature view API.
train_df, val_df, test_df, _, _, _ = feature_view.train_validation_test_split(
    validation_size=settings.TWO_TOWER_DATASET_VALIDATON_SPLIT_SIZE,
    test_size=settings.TWO_TOWER_DATASET_TEST_SPLIT_SIZE,
    description="Retrieval dataset splits",
)
# Preview the first rows of the training split.
train_df.head(3)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (7.32s) 




Unnamed: 0,customer_id,article_id,t_dat,price,month_sin,month_cos,age,club_member_status,age_group,garment_group_name,index_group_name
0,f7048acb8188d98bde3a5c495475a3c86faafe0eede1f2...,670265002,0,0.013542,-0.8660254,0.5,48.0,ACTIVE,46-55,"Under-, Nightwear",Ladieswear
1,5d34f84e6cbe9ec4706872bb65376097af1e53f0c7dac5...,751471035,0,0.033881,1.224647e-16,-1.0,30.0,ACTIVE,26-35,Trousers,Ladieswear
2,baf6dc7ea8575732794751bb80824fe84fd40e6af86193...,719308002,0,0.059305,0.5,-0.866025,48.0,ACTIVE,46-55,Dresses Ladies,Divided


### Compute embeddings

Next you compute the embeddings of all candidate items that were used to train the retrieval model.

In [9]:
# Build the candidate model's input frame from the training split with the
# project's `preprocess` helper, passing the feature list returned alongside the model.
item_df = features.embeddings.preprocess(train_df, candidate_features)
# Preview the first rows of the prepared item frame.
item_df.head(3)

Unnamed: 0,garment_group_name,article_id,index_group_name
0,"Under-, Nightwear",670265002,Ladieswear
1,Trousers,751471035,Ladieswear
2,Dresses Ladies,719308002,Divided


In [10]:
# Compute embeddings for the prepared items with the candidate model; the
# displayed result has an `article_id` column and an `embeddings` column.
embeddings_df = features.embeddings.embed(df=item_df, candidate_model=candidate_model)
embeddings_df.head()

Unnamed: 0,article_id,embeddings
0,670265002,"[-0.7604751586914062, 0.9238709807395935, -1.0..."
1,751471035,"[1.6406099796295166, 1.3932268619537354, 0.028..."
2,719308002,"[0.7188986539840698, 0.34544265270233154, -0.5..."
3,759231002,"[1.7318897247314453, 0.47544997930526733, -0.9..."
4,733027006,"[1.732801079750061, 0.2029293179512024, -0.809..."


# <span style="color:#ff5f27">Create Hopsworks Embedding Index </span>

Now you are ready to create a feature group for your candidate embeddings.

First, you create the embedding index, specifying the name of the embeddings feature and the length of each embedding vector.
Then you attach this index to the feature group (FG).

In [11]:
# Create the candidate-embeddings feature group from the embeddings frame via
# the project helper, requesting an online-enabled group (`online_enabled=True`).
candidate_embeddings_fg = (
    hopsworks_integration.feature_store.create_candidate_embeddings_feature_group(
        fs=fs, df=embeddings_df, online_enabled=True
    )
)
# Reached only if the helper call above returned without raising.
logger.info("✅ Uploaded 'candidate_embeddings' Feature Group to Hopsworks!!")

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1213603/fs/1201233/fg/1401277


Uploading Dataframe: 100.00% |██████████| Rows 11824/11824 | Elapsed Time: 00:03 | Remaining Time: 00:00


Launching job: candidate_embeddings_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1213603/jobs/named/candidate_embeddings_1_offline_fg_materialization/executions


[32m2025-02-19 03:58:49.970[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m6[0m - [1m✅ Uploaded 'candidate_embeddings' Feature Group to Hopsworks!![0m


## Expose it to the online inference pipeline as a Feature View


In [12]:
# Bound to its own name instead of reusing `feature_view`, which still refers to
# the "retrieval" feature view loaded earlier in the notebook. Overwriting it
# would make a re-run of the train/validation/test split cell operate on the
# candidate-embeddings view instead of the retrieval one.
candidate_embeddings_fv = (
    hopsworks_integration.feature_store.create_candidate_embeddings_feature_view(
        fs=fs, fg=candidate_embeddings_fg
    )
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1213603/fs/1201233/fv/candidate_embeddings/version/1


## <span style="color:#ff5f27"> Inspecting the embeddings in Hopsworks UI </span>

View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Feature Store → Feature Groups**

---

In [13]:
# Elapsed wall-clock time since `notebook_start_time` was recorded in the first cell.
notebook_end_time = time.time()
notebook_execution_time = notebook_end_time - notebook_start_time

# Report the duration both in seconds and in minutes, each with two decimals.
logger.info(
    f"⌛️ Notebook Execution time: {notebook_execution_time:.2f} seconds ~ {notebook_execution_time / 60:.2f} minutes"
)

[32m2025-02-19 03:59:37.091[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m4[0m - [1m⌛️ Notebook Execution time: 357.80 seconds ~ 5.96 minutes[0m


# <span style="color:#ff5f27">→ Next Steps </span>

Now that we have our vector index populated with item embeddings, everything is ready for production. In the next notebook, we will zoom in on the inference pipeline and show how to deploy it to Hopsworks as a real-time deployment.