## <span style='color:#ff5f27'> 📝 Imports </span>

In [1]:
import joblib
import os
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

import xgboost as xgb
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# Mute warnings
import warnings
warnings.filterwarnings("ignore")

## <span style="color:#ff5f27;"> 📡 Connecting to Hopsworks Feature Store </span>

In [2]:
import hopsworks

# Log in to Hopsworks; the returned project handle is reused by later cells
# (model registry, dataset API, environment API).
project = hopsworks.login()

# Feature store handle used below to look up feature groups and build the feature view.
fs = project.get_feature_store()

2025-06-26 11:38:12,135 INFO: Initializing external client
2025-06-26 11:38:12,135 INFO: Base URL: https://10.87.42.15:28181
2025-06-26 11:38:12,830 INFO: Python Engine initialized.

Logged in to project, explore it here https://10.87.42.15:28181/p/119


---

## <span style="color:#ff5f27;"> 🔪 Feature Selection </span>

In [3]:
# Look up version 1 of the two online feature groups.
trans_fg = fs.get_feature_group(name='transactions_fraud_online_fg', version=1)
profile_online_fg = fs.get_feature_group(name='profile_fraud_online_fg', version=1)

# Columns taken from the transactions feature group. `fraud_label` is selected
# here and declared as the label when the feature view is created.
transaction_columns = [
    'amount',
    'country',
    'fraud_label',
    'loc_delta_t_plus_1',
    'loc_delta_t_minus_1',
    'time_delta_t_minus_1',
]

# Build the training query: transaction columns joined with the profile features.
transactions_query = trans_fg.select(transaction_columns)
profile_query = profile_online_fg.select_features()
selected_features = transactions_query.join(profile_query)

2025-06-26 11:38:14,120 INFO: Using ['gender'] from feature group `profile_fraud_online_fg` as features for the query. To include primary key and event time use `select_all`.


In [4]:
# Uncomment this if you would like to view your selected features
# selected_features.show(5)

---

### <span style="color:#ff5f27;"> 🤖 Transformation Functions </span>

Manually creating `label_encoder` as a python UDF to get optimal performance during online inference. 

Python UDFs are faster than Pandas UDFs for small volumes of data since they avoid the overhead of creating pandas DataFrames/Series. However, Python UDFs are slower than Pandas UDFs for larger volumes of data.

In [5]:
from hsfs.transformation_statistics import TransformationStatistics

@hopsworks.udf(int, drop=["feature"], mode="python")
def label_encoder(feature, statistics=TransformationStatistics("feature")) -> int:
    """Encode a single categorical value as an integer index.

    The index is the position of ``feature`` in the sorted list of
    ``statistics.feature.unique_values``.

    Args:
        feature: One scalar value of the column being encoded (the UDF is
            declared with ``mode="python"``, so it is not a pandas Series).
        statistics: Statistics object exposing ``feature.unique_values``.

    Returns:
        The integer index of ``feature``, ``-1`` when the value is not among
        the known unique values, or ``None`` when ``feature`` is ``None``.
    """
    # `sorted` accepts any iterable, so no intermediate list is needed.
    unique_data = sorted(statistics.feature.unique_values)
    value_to_index = {value: index for index, value in enumerate(unique_data)}
    # Unknown categories not present in training dataset are encoded as -1.
    return value_to_index.get(feature, -1) if feature is not None else None

# Bind `label_encoder` to the `country` and `gender` columns.
# NOTE(review): the following cell assigns this identical list again; one of
# the two assignments is redundant.
transformation_functions = [
    label_encoder("country"),
    label_encoder("gender"),
]

In [6]:
# Map features to transformation functions: `label_encoder` is applied to
# `country` and to `gender`. The list is passed to the feature view below.
transformation_functions = [
    label_encoder("country"),
    label_encoder("gender"),
]

## <span style="color:#ff5f27;"> ⚙️ Feature View Creation </span>

In [7]:
# Get or create the 'transactions_fraud_online_fv' feature view.
# The view is built from the `selected_features` query, treats `fraud_label`
# as the label column, and attaches the label-encoding transformation functions.
feature_view = fs.get_or_create_feature_view(
    name='transactions_fraud_online_fv',
    version=1,
    query=selected_features,
    labels=["fraud_label"],
    transformation_functions=transformation_functions,
)

Feature view created successfully, explore it at 
https://10.87.42.15:28181/p/119/fs/67/fv/transactions_fraud_online_fv/version/1


## <span style="color:#ff5f27;"> 🏋️ Training Dataset </span>

In [8]:
# Training/Test splits, datasets creation. Using timerange arguments.
# NOTE(review): `train_end` and `test_start` are the same date; confirm how the
# API treats the shared boundary so that no rows land in both splits.
train_start = "2022/01/01"
train_end = "2022/03/10"
test_start = "2022/03/10"
test_end = "2022/03/31"

# Materialise the split; only the training dataset version is kept, the second
# return value is discarded.
td_version, _ = feature_view.create_train_test_split(
    train_start=train_start,
    train_end=train_end,
    test_start=test_start,
    test_end=test_end,
)

# Read back the features and labels of the split that was just created.
X_train, X_test, y_train, y_test = feature_view.get_train_test_split(td_version)

Training dataset job started successfully, you can follow the progress at 
https://10.87.42.15:28181/p/119/jobs/named/transactions_fraud_online_fv_1_create_fv_td_26062025093821/executions
2025-06-26 11:38:30,120 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-06-26 11:38:33,220 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-06-26 11:40:15,912 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-06-26 11:40:16,044 INFO: Waiting for log aggregation to finish.
2025-06-26 11:40:24,520 INFO: Execution finished successfully.

2025-06-26 11:40:28,389 INFO: Provenance cached data - overwriting last accessed/created training dataset from 1 to 1.


In [9]:
# Display the normalized value counts of the training labels (y_train).
# The recorded output shows a heavily imbalanced target (about 0.35% positives).
y_train.value_counts(normalize=True)

fraud_label
0              0.996458
1              0.003542
Name: proportion, dtype: float64

---

## <span style="color:#ff5f27;"> 🧬 Modeling</span>

In [10]:
# Initialize an XGBoost classifier with the library's default hyperparameters
# (no arguments are passed).
# NOTE(review): nothing here compensates for the class imbalance shown in the
# label distribution above — confirm that this is intended.
model = xgb.XGBClassifier()

# Train the classifier using the training features (X_train) and labels (y_train)
model.fit(X_train, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [11]:
# Predict the training set (not read by any later cell shown in this notebook)
y_pred_train = model.predict(X_train)

# Predict the test set; these predictions feed the F1 score and confusion matrix below
y_pred_test = model.predict(X_test)

In [12]:
# Compute the macro-averaged F1 score on the test split. The `metrics` dict is
# later attached to the model in the model registry.
# NOTE(review): the recorded confusion matrix further down contains only true
# negatives (no fraud rows in the test split), so the 1.0 reported here says
# nothing about fraud detection — confirm the test window contains positives.
metrics = {
    "f1_score": f1_score(y_test, y_pred_test, average='macro')
}
metrics

{'f1_score': 1.0}

In [13]:
# Calculate the confusion matrix for the test set predictions.
# `labels` fixes the row/column order to (negative, positive) and keeps the
# matrix 2x2 even when a class is absent from the data.
# NOTE(review): this relies on False/True comparing equal to the 0/1 values of
# `fraud_label` — confirm against the label dtype.
results = confusion_matrix(
    y_test, 
    y_pred_test, 
    labels=[False, True],
)

# Print the confusion matrix
print(results)

[[40229     0]
 [    0     0]]


---

## <span style="color:#ff5f27;">📝 Register model</span>

In [14]:
# Specify the model directory (relative to the notebook's working directory)
# and an `images` sub-directory for evaluation plots.
model_dir = "fraud_online_model"
images_dir = os.path.join(model_dir, "images")

# Create directories if they don't exist; creating `images_dir` also creates
# its parent `model_dir`.
os.makedirs(images_dir, exist_ok=True)

In [15]:
# Save the trained XGBoost model. The file name must match the one loaded by
# the predictor script (`xgboost_fraud_online_model.pkl`).
joblib.dump(model, os.path.join(model_dir, "xgboost_fraud_online_model.pkl"))

['fraud_online_model/xgboost_fraud_online_model.pkl']

In [16]:
# Label the raw confusion-matrix counts: rows are true classes, columns are predictions.
df_cm = pd.DataFrame(
    results,
    index=['True Normal', 'True Fraud'],
    columns=['Pred Normal', 'Pred Fraud'],
)

# Draw the annotated heatmap on a fresh 8x6 figure.
plt.figure(figsize=(8, 6))
heatmap_options = {
    'annot': True,              # write the count inside each cell
    'fmt': 'd',                 # integer formatting for the counts
    'cmap': 'RdPu',             # colour palette
    'annot_kws': {'size': 12},  # annotation text size
    'cbar': True,               # show the colour bar
}
cm = sns.heatmap(df_cm, **heatmap_options)

# Title and axis labels, set directly on the heatmap axes.
cm.set_title('Confusion Matrix for Fraud Detection')
cm.set_ylabel('True Label')
cm.set_xlabel('Predicted Label')

# Tighten the layout, write the PNG next to the model files, then release the figure.
cm.figure.tight_layout()
cm.figure.savefig(os.path.join(images_dir, "confusion_matrix.png"), dpi=300, bbox_inches='tight')
plt.close(cm.figure)

In [17]:
# Get the model registry
mr = project.get_model_registry()

# Create a Python model in the model registry
# NOTE(review): the predictor script forwards request inputs as `entries` to the
# feature-store REST API; confirm that the bare-id `input_example` below matches
# the payload shape that API expects.
fraud_model = mr.python.create_model(
    name="xgboost_fraud_online_model", 
    description="Fraud Online Predictor", # Add a description for the model
    metrics=metrics,                      # Specify the metrics used to evaluate the model
    input_example=[4467360740682089],     # Example input for testing deployments
    feature_view=feature_view,            # Add a feature view to the model
    training_dataset_version=td_version   # Add the version of the training dataset used to train the model
)

# Save the model to the specified model directory
# (uploads the contents of `model_dir`: the pickled model and the images folder).
fraud_model.save(model_dir)

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading /Users/manu/Desktop/HopsWorks/bechmarking_hopsworks/fraud_online_model/xgboost_fraud_online_model.pk…

Uploading /Users/manu/Desktop/HopsWorks/bechmarking_hopsworks/fraud_online_model/images/confusion_matrix.png: …

Uploading /Users/manu/Desktop/HopsWorks/bechmarking_hopsworks/input_example.json: 0.000%|          | 0/18 elap…

Uploading /Users/manu/Desktop/HopsWorks/bechmarking_hopsworks/model_schema.json: 0.000%|          | 0/664 elap…

Model created, explore it at https://10.87.42.15:28181/p/119/models/xgboost_fraud_online_model/1


Model(name: 'xgboost_fraud_online_model', version: 1)

---

## <a class="anchor" id="1.5_bullet" style="color:#ff5f27"> 🚀 Model Deployment</a>

### <span style="color:#ff5f27;">📎 Predictor script for Python models</span>

The predictor script uses the [online feature store REST APIs](https://docs.hopsworks.ai/latest/user_guides/fs/feature_view/feature-server/) and an async `predict` function to allow higher IO-bound parallelism.

The `predict` function uses [aiohttp](https://docs.aiohttp.org/en/stable/) to asynchronously retrieve the feature vectors from the online feature store REST APIs. It then uses the feature view to transform the retrieved feature vectors with the model-dependent transformations attached to the feature view. The transformed feature vectors are then passed to the trained XGBoost model.

The concurrency used by the XGBoost model is limited since it is a simple model and we do not want it to create contention between threads and cause a bottleneck.


> **NOTE:**
>
>The predictor requires a secret called `HOPSWORKS_API_KEY` which contains the Hopsworks API key that you have generated. You can create secrets in Hopsworks by following the documentation here : https://docs.hopsworks.ai/latest/user_guides/projects/secrets/create_secret/ 

In [46]:
%%writefile predict_rdrs_async.py

# XGBoost models release Python's GIL during prediction so they can use multiple threads - https://xgboosting.com/xgboost-releases-gil-during-inference-prediction/
# Here we limit OpenMP to a single thread to reduce the overhead of context switching between threads.
import os
os.environ["OMP_NUM_THREADS"] = "1"
import numpy as np
import hopsworks
import joblib
import aiohttp
import hopsworks
from hopsworks.hsfs.core import variable_api

class Predict(object):
    """Async predictor: fetch features over REST, transform them, score with the model.

    For each request the predictor posts the incoming entries to the online
    feature store REST endpoint, applies the feature view's transformations to
    the returned feature vectors and returns the model's predictions as a list.
    """

    # Port and API version of the online feature store REST server.
    ONLINE_STORE_REST_PORT = 4406
    ONLINE_STORE_REST_API_VERSION = "0.1.0"

    def __init__(self, model):
        """Initialize the serving state and load the trained model.

        Args:
            model: Model object handed in by the serving runtime; only its
                `get_feature_view()` method is used here.
        """

        # Log in to Hopsworks to use the secrets and variables APIs.
        self.project = hopsworks.login()
        secrets_api = hopsworks.get_secrets_api()

        # Fetch the host name of the online feature store REST API.
        var_api = variable_api.VariableApi()
        self.online_store_rest_endpoint = var_api.get_loadbalancer_external_domain("online_store_rest_server")

        # Fetch the API key and build the header required by the REST API.
        self.headers = {'X-API-KEY': secrets_api.get("HOPSWORKS_API_KEY")}

        # Feature view attached to the model; used for request metadata and transformations.
        self.feature_view = model.get_feature_view()

        # Load the pickled model from the directory provided by the serving runtime.
        self.model = joblib.load(
            os.path.join(os.environ["MODEL_FILES_PATH"], "xgboost_fraud_online_model.pkl")
        )

        # The aiohttp session is lazily initialized because no asyncio event
        # loop exists yet when the predictor is constructed.
        self._session = None

    async def predict(self, inputs):
        """Serve a prediction request using the trained model.

        Args:
            inputs: Request payload, forwarded unchanged as the `entries`
                field of the feature store request.

        Returns:
            The model predictions as a plain Python list (JSON serializable).

        Raises:
            Whatever `raise_for_status()` raises when the feature store
            answers with an HTTP error status.
        """

        data = {
            "featureStoreName": self.project.name,
            "featureViewName": self.feature_view.name,
            "featureViewVersion": self.feature_view.version,
            "entries": inputs,
            "metadataOptions": {
                "featureName": True
            }
        }

        url = (
            f"https://{self.online_store_rest_endpoint}:{self.ONLINE_STORE_REST_PORT}"
            f"/{self.ONLINE_STORE_REST_API_VERSION}/batch_feature_store"
        )

        # NOTE(review): ssl=False turns off TLS certificate verification while
        # the API key is sent in the headers. Kept as in the original so that
        # existing deployments keep working — confirm this is acceptable.
        async with self.session.post(url,
                                     headers=self.headers,
                                     json=data,
                                     ssl=False) as rsp:
            # Fail loudly on HTTP errors instead of hitting a KeyError on
            # "features" when the body is an error document.
            rsp.raise_for_status()
            rsp_data = await rsp.json()

        feature_vector = rsp_data["features"]

        transformed_feature_vectors = self.feature_view.transform(feature_vector)

        # Numpy arrays are not JSON serializable, hence the conversion to a list.
        prediction = self.model.predict(transformed_feature_vectors).tolist()

        return prediction

    @property
    def session(self):
        """Return the shared aiohttp session, creating it on first use or after it was closed."""
        if self._session is None or self._session.closed:
            self._session = aiohttp.ClientSession()
        return self._session

Overwriting predict_rdrs_async.py


In [41]:
# Get the dataset API for the current project
dataset_api = project.get_dataset_api()

# Specify the local file path of the Python script to be uploaded
# (the file written by the `%%writefile` cell above)
local_script_path_async_rdrs = "predict_rdrs_async.py"

# Upload the Python script to the "Models" dataset, overwriting it if it already exists
uploaded_file_path_async_rdrs = dataset_api.upload(local_script_path_async_rdrs, "Models", overwrite=True)

# Create the full path to the uploaded script; it is passed as `script_file` when deploying
predictor_script_path_async_rdrs = os.path.join("/Projects", project.name, uploaded_file_path_async_rdrs)

Uploading /Users/manu/Desktop/HopsWorks/bechmarking_hopsworks/predict_rdrs_async.py: 0.000%|          | 0/2733…

### Create the environment required for the deployment

In [42]:
# Upload the requirements file to the "Resources" dataset in the Hopsworks file
# system, overwriting any existing copy. The file is read from the notebook's
# working directory; the returned path is used when installing the environment.
# NOTE(review): a dataset API handle already exists as `dataset_api` from the
# previous cell; `ds_api` is a second handle to the same API.
ds_api = project.get_dataset_api()
requirements_path = ds_api.upload("requirements-deployment.txt", "Resources", overwrite=True)

Uploading /Users/manu/Desktop/HopsWorks/bechmarking_hopsworks/requirements-deployment.txt: 0.000%|          | …

In [43]:
# Reuse the deployment environment when it already exists; otherwise create it
# on top of the base inference environment and install the uploaded requirements.
env_api = project.get_environment_api()

deployment_env_name = "pandas-inference-pipeline-aio-hhtp"
deployment_env = env_api.get_environment(deployment_env_name)

if not deployment_env:
    deployment_env = env_api.create_environment(
        deployment_env_name,
        base_environment_name="pandas-inference-pipeline",
    )
    # Install the extra requirements into the freshly created environment
    deployment_env.install_requirements(requirements_path)

### Create the deployment

The deployment uses 30 instances with 1 CPU each. This is the minimal requirement to hit a benchmark of 5000 RPS with batch sizes of 25 for this deployment. We scale horizontally instead of vertically to avoid issues with the [Python Global Interpreter Lock](https://wiki.python.org/moin/GlobalInterpreterLock).

In [44]:
# Deploy the fraud model
# NOTE(review): the text above this cell describes 30 instances, but
# `num_instances` below is 1 — confirm which value is intended.
deployment_async_rdrs = fraud_model.deploy(
    name="deploymentasyncrdrs",  # Specify a name for the deployment
    script_file=predictor_script_path_async_rdrs,  # Provide the path to the Python script for prediction
    resources={'num_instances': 1, 'requests': {'cores': 1}, 'limits': {'cores': 1}},  # One instance, pinned to exactly one core
    environment=deployment_env.name  # Run inside the environment created above
)

Deployment created, explore it at https://10.87.42.15:28181/p/119/deployments/2054
Before making predictions, start the deployment by using `.start()`


In [45]:
# Start the deployment created above; predictions can be requested once it is running.
deployment_async_rdrs.start()

  0%|          | 0/5 [00:00<?, ?it/s]

Start making predictions by using `.predict()`
