# <span style="font-width:bold; font-size: 3rem; color:#1EB182;"> **Hopsworks Feature Store** </span>
<span style="font-width:bold; font-size: 2rem; color:#333;">Part 02 - Training Pipeline: Load, Train & Deploy</span>

## <span style="color:#ff5f27;">📝 Imports</span>

In [67]:
import ast
import numpy as np
import pandas as pd
import hopsworks
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from ast import literal_eval
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import hsfs
import os
import joblib
from graphdatascience import GraphDataScience
from keys import *

## <span style="color:#ff5f27;">📡 Connecting to Hopsworks Feature Store</span>

In [62]:
# Login to Hopsworks
'''
project = hopsworks.login()
'''

project = hsfs.connection(
    host='staging.cloud.hopsworks.ai',                 # DNS of your Feature Store instance
    port=443,                           # Port to reach your Hopsworks instance, defaults to 443
    project='neo4j_tutorial_new',               # Name of your Hopsworks Feature Store project
    api_key_value='OzvDK7qnG4VlhPxX.3wzZfJWWFmuYegomtWDRXAAvvGhlJEPkg4bn9kSbQaWPDgC3akbtERQOojLtEl5r',            # The API key to authenticate with the feature store
    hostname_verification=True          # Disable for self-signed certificates
)

# Retrieve Feature Store
fs = project.get_feature_store()           # Get the project's default feature store

Connected. Call `.close()` to terminate connection gracefully.


## <span style="color:#ff5f27;">🔪 Feature Selection</span>

In [63]:
# Retrieve Feature Groups
port_distance_fg = fs.get_feature_group(
    name='port_distances',
    version=1,
)

In [64]:
# Create Feature View
feature_view = fs.get_or_create_feature_view(
    name='port_fv',
    version=1,
    query=port_distance_fg.select_all(),
    labels=["distance_km"] # tell hopsworks what feature is the label
)

# Embeddings

In [70]:
# Load Graph from Neo4J
gds = GraphDataScience(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD), database=DATABASE_NAME)
port_routes = gds.graph.get("portRoutes2")

In [71]:
node_embeddings_df = gds.node2vec.stream(port_routes) 
node_embeddings_df.columns = ['node_id', 'embedding'] # rename columns for merging source side
node_embeddings_df

Unnamed: 0,node_id,embedding
0,2357,"[-0.00034653692273423076, 0.001878194278106093..."
1,2358,"[-0.0005902331904508173, -0.002221952192485332..."
2,2359,"[-0.00031994495657272637, -0.00092086417134851..."
3,2360,"[-4.374860509415157e-05, -0.00225309981033206,..."
4,2361,"[-0.00019197513756807894, -0.00068977207411080..."
...,...,...
724,1035413,"[0.0037719900719821453, -0.0027560207527130842..."
725,1036604,"[0.0003116043808404356, 7.55660657887347e-05, ..."
726,1044372,"[0.003654107917100191, 0.0020752798300236464, ..."
727,1044373,"[0.0036534080281853676, 0.0033998454455286264,..."


In [72]:
# Check embedding size
len(node_embeddings_df.embedding.values[0])

128

## <span style="color:#ff5f27;">⚙️ Feature View Creation</span>

In [73]:
query = port_embeddings_fg.select(["node_id", "target_node_id", "distance_km"])

## <span style="color:#ff5f27;">🏋️ Training Dataset</span>

In [None]:
# create a training dataset 
#X_train, X_test, y_train, y_test = feature_view.train_test_split(test_size=0.2)

X_train, X_test, y_train, y_test = feature_view.get_train_test_split(training_dataset_version=1)

---

# Feature Engineering

In [22]:
# Combine embeddings and explode features
def combine_explode_embeddings(data):
    # Convert embeddings from String to List
    if type(data['source_node_embedding'].iloc[0]) == str:
        data['source_node_embedding'] = data['source_node_embedding'].apply(literal_eval)
    if type(data['target_node_embedding'].iloc[0]) == str:
        data['target_node_embedding'] = data['target_node_embedding'].apply(literal_eval)

    # Concatenate embeddings
    data['concatenated_embedding'] = data.source_node_embedding + data.target_node_embedding

    # Remove original embedding features
    data = data.drop(['source_node_embedding', 'target_node_embedding'], axis=1)

    # Convert to Numpy array
    data['concatenated_embedding'] = data['concatenated_embedding'].apply(np.array)

    # Explode Embedding features
    data_exploded = pd.DataFrame(
        data['concatenated_embedding'].to_list(),
        columns=[f'feature_{i}' for i in range(len(data.concatenated_embedding.iloc[0]))]
    )
    return data_exploded

X_train_exploded = combine_explode_embeddings(X_train)
X_test_exploded = combine_explode_embeddings(X_test)

In [25]:
X_train_exploded.shape

(256, 256)

In [27]:
X_test_exploded.shape

(64, 256)

## <span style="color:#ff5f27;">🏃 Train Model</span>

In [39]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_exploded)
X_test_scaled = scaler.transform(X_test_exploded)

In [49]:
# Build the neural network model
model = Sequential()
model.add(Dense(64, input_dim=X_train_exploded.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))  # Use linear activation for regression

# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the model
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2, validation_data=(X_test_scaled, y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x2cb5cb610>

## <span style="color:#ff5f27;">👨🏻‍⚖️ Model Evaluation</span>

In [50]:
# Evaluate the model on the test set
mse = model.evaluate(X_test_scaled, y_test)
print(f'Mean Squared Error on Test Set: {mse}')

Mean Squared Error on Test Set: 177568336.0


## <span style="color:#ff5f27;">🗄 Model Registry</span>
One of the features in Hopsworks is the model registry. This is where you can store different versions of models and compare their performance.

In [53]:
# Get the model registry
mr = project.get_model_registry()

AttributeError: 'Connection' object has no attribute 'get_model_registry'

## <span style="color:#ff5f27;">⚙️ Model Schema</span>
The model needs to be set up with a Model Schema, which describes the inputs and outputs for a model.

A Model Schema can be automatically generated from training examples, as shown below.

In [54]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Create input schema using X_train
input_schema = Schema(X_train_exploded)

# Create output schema using y_train
output_schema = Schema(y_train)

# Create a ModelSchema object specifying the input and output schemas
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

# Convert the model schema to a dictionary
model_schema.to_dict()

{'input_schema': {'columnar_schema': [{'name': 'feature_0', 'type': 'float64'},
   {'name': 'feature_1', 'type': 'float64'},
   {'name': 'feature_2', 'type': 'float64'},
   {'name': 'feature_3', 'type': 'float64'},
   {'name': 'feature_4', 'type': 'float64'},
   {'name': 'feature_5', 'type': 'float64'},
   {'name': 'feature_6', 'type': 'float64'},
   {'name': 'feature_7', 'type': 'float64'},
   {'name': 'feature_8', 'type': 'float64'},
   {'name': 'feature_9', 'type': 'float64'},
   {'name': 'feature_10', 'type': 'float64'},
   {'name': 'feature_11', 'type': 'float64'},
   {'name': 'feature_12', 'type': 'float64'},
   {'name': 'feature_13', 'type': 'float64'},
   {'name': 'feature_14', 'type': 'float64'},
   {'name': 'feature_15', 'type': 'float64'},
   {'name': 'feature_16', 'type': 'float64'},
   {'name': 'feature_17', 'type': 'float64'},
   {'name': 'feature_18', 'type': 'float64'},
   {'name': 'feature_19', 'type': 'float64'},
   {'name': 'feature_20', 'type': 'float64'},
   {'name

In [59]:
# Specify the directory where the model files will be saved
model_dir = "neo4j_model"

# Check if the directory exists, and create it if it doesn't
if not os.path.isdir(model_dir):
    os.mkdir(model_dir)

# Specify the file name for the pickled model
pkl_file_name = model_dir + '/neural_network_model.pkl'

# Save the trained classifier using joblib
joblib.dump(model, pkl_file_name)

['neo4j_model/neural_network_model.pkl']

In [60]:
# Create a model in the model registry
model = mr.python.create_model(
    name="neo4j_neural_network",
    description="Neo4j Neural Network",
    input_example=X_train.sample(),
    model_schema=model_schema,
)

# Save the model to the specified directory
model.save(model_dir)

NameError: name 'mr' is not defined

## <span style="color:#ff5f27;">⏭️ Next: Inference Pipeline</span>
In the [following notebook](3_infence_pipeline.ipynb) you will use your model for batch inference.