## <span style="color:#ff5f27">📝 Imports </span>

In [None]:
!pip install -U kaleido # For Plotly Image export

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import os
import joblib
from features.price import plot_prediction_test
from functions import predict_id
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

## <span style="color:#ff5f27">🔮 Connect to Hopsworks Feature Store </span>

In [None]:
import hopsworks

# Log in to Hopsworks; the returned project handle is reused later for the
# model registry and the dataset API
project = hopsworks.login()

# Handle to the project's feature store, used below to fetch feature groups,
# transformation functions and the feature view
fs = project.get_feature_store() 

In [None]:
# Fetch version 1 of the 'averages' and 'prices' feature groups (in that order)
averages_fg, prices_fg = (
    fs.get_feature_group(name=group_name, version=1)
    for group_name in ('averages', 'prices')
)

## <span style="color:#ff5f27">🔪 Feature Selection </span>

In [None]:
# Training query: prices_fg.select_all() joined with averages_fg.select_features()
selected_features = prices_fg.select_all().join(
    averages_fg.select_features()
)

# To preview the joined features, uncomment the line below
# selected_features.show(5)

## <span style="color:#ff5f27">🤖 Transformation Functions </span>

In [None]:
# Fetch the 'min_max_scaler' transformation function from the feature store
min_max_scaler = fs.get_transformation_function(name="min_max_scaler")

# Features that get a min_max_scaler transformation attached
feature_names = [
    'ma_7',
    'ma_14',
    'ma_30',
    'daily_rate_of_change',
    'volatility_30_day',
    'ema_02',
    'ema_05',
    'rsi',
]

# One min_max_scaler(<feature>) result per entry, in the order of feature_names
transformation_functions = list(map(min_max_scaler, feature_names))
transformation_functions

## <span style="color:#ff5f27">⚙️ Feature View Creation </span>

In [None]:
# Get or create version 1 of the 'price_fv' feature view.
# It is built from the joined query defined above, declares 'price' as the
# label column, and carries the per-feature transformation functions.
feature_view = fs.get_or_create_feature_view(
    name='price_fv',
    version=1,
    query=selected_features,
    labels=["price"],
    transformation_functions=transformation_functions,
)

## <span style="color:#ff5f27">🏋️ Training Dataset Creation </span>

In [None]:
# Get training and testing sets, split by time range.
# NOTE: test_end is evaluated at run time (today's date), so the test split
# can differ from one run of the notebook to the next.
X_train, X_test, y_train, y_test = feature_view.train_test_split(
    description='Prices Dataset',                    # Provide a description for the dataset split
    train_start='2024-01-01',                        # Start date for the training set
    train_end='2024-08-31',                          # End date for the training set
    test_start='2024-09-01',                         # Start date for the testing set
    test_end=datetime.today().strftime("%Y-%m-%d"),  # End date for the testing set (current date, formatted YYYY-MM-DD)
)

In [None]:
# Preview the first three rows of the training features
X_train.head(3)

In [None]:
# Preview the first three rows of the training target
y_train.head(3)

In [None]:
# Put both splits in 'date' order; each target is realigned to the reordered
# feature index so that rows keep matching
X_train = X_train.sort_values(by="date")
y_train = y_train.reindex(X_train.index)

X_test = X_test.sort_values(by="date")
y_test = y_test.reindex(X_test.index)

# Remove 'date' from the feature frames and keep it as one-column DataFrames
train_date = X_train.pop("date").to_frame()
test_date = X_test.pop("date").to_frame()

## <span style="color:#ff5f27">🧬 Modeling </span>

We will use the XGBoost Regressor. XGBoost regressor is a powerful and highly effective machine learning algorithm for regression problems. XGBoost is known for its ability to handle complex relationships in the data, handle missing values, and provide accurate predictions. It's a popular choice in the data science community due to its robustness and excellent predictive performance, making it well-suited for our specific problem.

In [None]:
# Initialize the XGBoost regressor with its default hyperparameters
model = xgb.XGBRegressor()

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the held-out test set
y_test_pred = model.predict(X_test)

# Evaluate on the test set.
# `mse` holds the mean squared error, so the variable name, the printed label
# and the "MSE" metric registered with the model all describe the same quantity.
# RMSE is derived with np.sqrt instead of `squared=False`, which is deprecated
# in scikit-learn 1.4 and removed in 1.6.
mse = mean_squared_error(y_test, y_test_pred)
rmse = float(np.sqrt(mse))
print(f"🎯 Mean Squared Error (MSE): {mse}")
print(f"🎯 Root Mean Squared Error (RMSE): {rmse}")

In [None]:
# Run the trained model for ID 1 on the test features via the 'predict_id' helper
prediction_for_id = predict_id(1, X_test, model)

# Build the Plotly figure for ID 1 from the train/test data, their dates
# and the prediction computed above
fig = plot_prediction_test(
    1, X_train, X_test, y_train, y_test, train_date, test_date, prediction_for_id,
)

# Render the figure in the notebook
fig.show()

## <span style="color:#ff5f27">⚙️ Model Schema </span>

In [None]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Create an input schema from the training features.
# NOTE(review): the features are passed as a NumPy array (.values) while the
# target below is passed as-is — presumably intentional; confirm.
input_schema = Schema(X_train.values)

# Create an output schema using the target variable
output_schema = Schema(y_train)

# Combine the input and output schemas into a single model schema
model_schema = ModelSchema(
    input_schema=input_schema, 
    output_schema=output_schema,
)

# Display the model schema as a dictionary (last expression of the cell)
model_schema.to_dict()

## <span style="color:#ff5f27">📝 Register model </span>

In [None]:
# Directory that will be uploaded to the model registry
model_dir = "price_model"

# Sub-directory for figures stored alongside the model
images_dir = f"{model_dir}/images"

# Create model_dir and images_dir in a single call. exist_ok=True keeps the
# cell re-runnable and avoids the race in a separate "check, then mkdir" step.
os.makedirs(images_dir, exist_ok=True)

# Save the trained XGBoost model using joblib
joblib.dump(model, f'{model_dir}/xgboost_price_model.pkl')

# Write the generated Plotly figure image to the images directory
fig.write_image(f'{images_dir}/model_prediction.png')

In [None]:
# Get the model registry from the project
mr = project.get_model_registry()

# Create a Python model in the model registry named 'xgboost_price_model'.
# NOTE(review): X_train.sample() is unseeded, so the stored input example
# differs between runs — confirm that is acceptable.
price_model = mr.python.create_model(
    name="xgboost_price_model", 
    metrics={"MSE": mse},           # Test-set error metric computed in the modeling cell
    model_schema=model_schema,      # Provide the model schema
    input_example=X_train.sample(), # Provide one random training row as an input example
    description="Price Predictor",  # Add a description for the model
    feature_view=feature_view,      # Attach the feature view so the predictor can retrieve it from the model
)

# Upload the contents of the local model directory to the model registry
price_model.save(model_dir)

## <span style="color:#ff5f27">🚀 Model Deployment</span>

**About Model Serving**

Models can be served via KFServing or "default" serving, which means a Docker container exposing a Flask server. For KFServing models, or models written in TensorFlow, you do not need to write a prediction file (see the section below). However, for sklearn models using default serving, you do need to write a prediction file.

In order to use KFServing, you must have Kubernetes installed and enabled on your cluster.

## <span style="color:#ff5f27">📎 Predictor script for Python models</span>

Scikit-learn and XGBoost models are deployed as Python models, in which case you need to provide a Predict class that implements the predict method. The `predict()` method invokes the model on the inputs and returns the prediction as a list.

The `__init__()` method is run when the predictor is loaded into memory, loading the model from the local directory it is materialized to, `ARTIFACT_FILES_PATH`.

The **`%%writefile`** cell magic writes the contents of its own cell (everything below the magic line) to the given Python file. We will use the **predict_example.py** file to create a deployment for our model.

In [None]:
%%writefile predict_example.py
import os
import numpy as np
import pandas as pd
import hopsworks
import joblib


class Predict(object):
    """Predictor used by the deployment to serve price predictions by ID."""

    def __init__(self):
        """Set up the serving state: the model's feature view and the trained model."""
        # Connect to the project and open its model registry
        hopsworks_project = hopsworks.login()
        registry = hopsworks_project.get_model_registry()

        # The feature view is looked up through version 1 of the registered model
        registered_model = registry.get_model(name="xgboost_price_model", version=1)
        self.feature_view = registered_model.get_feature_view()

        # The pickled model is read from the directory named by ARTIFACT_FILES_PATH
        artifact_dir = os.environ["ARTIFACT_FILES_PATH"]
        self.model = joblib.load(artifact_dir + "/xgboost_price_model.pkl")
        print("✅ Initialization Complete")

    def predict(self, id_value):
        """Serve a prediction request using the trained model.

        `id_value` is a nested sequence; the ID is read from `id_value[0][0]`.
        Returns the model's prediction converted to a plain list.
        """
        # Look up the feature vector for the requested ID
        entity_id = id_value[0][0]
        vector = self.feature_view.get_feature_vector(entry={'id': entity_id})

        # The first element is skipped (presumably the id itself — TODO confirm);
        # the rest is reshaped into a single-row 2D array for the model
        model_input = np.asarray(vector[1:]).reshape(1, -1)
        return self.model.predict(model_input).tolist()

This script needs to be put into a known location in the Hopsworks file system. Let's call the file predict_example.py and put it in the Models directory.

In [None]:
# Get the dataset API from the project
dataset_api = project.get_dataset_api()

# Upload the file "predict_example.py" to the "Models" dataset, overwriting if it already exists
uploaded_file_path = dataset_api.upload("predict_example.py", "Models", overwrite=True)

# Build the full path to the uploaded predictor script from "/Projects",
# the project name and the path returned by the upload.
# NOTE(review): os.path.join discards the earlier components if
# uploaded_file_path is absolute — confirm upload() returns a relative path.
predictor_script_path = os.path.join("/Projects", project.name, uploaded_file_path)

---

## <span style="color:#ff5f27">🚀 Create the deployment</span>

Here, you fetch the model you want from the model registry and define a configuration for the deployment. For the configuration, you need to specify the serving type (default or KFServing).

In [None]:
# Create a deployment for the registered 'price_model', served by the
# uploaded predictor script
deployment = price_model.deploy(
    name="priceonlinemodeldeployment",  # Specify the deployment name
    script_file=predictor_script_path,  # Provide the path to the predictor script
)

In [None]:
# Start the deployment and wait up to 360 seconds for it to be running
deployment.start(await_running=360)

In [None]:
# Get the current state of the deployment and describe it.
# NOTE(review): if describe() only prints and returns None, deployment_state
# will be None — confirm before relying on this variable.
deployment_state = deployment.get_state().describe()

In [None]:
# Request a price prediction for ID 1. The input is a nested list because the
# predictor script reads the ID from id_value[0][0].
deployment.predict(inputs=[[1]])

---