In [7]:
import hopsworks
import pandas as pd
import json

## Creating feature groups from a feature group containing JSON data

#### Login to Hopsworks and fetch feature group

In [4]:
# Log in to Hopsworks; the returned object is the project handle used below.
project = hopsworks.login()

# Get the feature store handle from the project.
fs = project.get_feature_store()

# Look up version 1 of the feature group that holds the raw JSON event data.
fg_json = fs.get_feature_group("fg_raw_event_data", version=1)

Connection closed.
2025-03-12 20:18:51,407 INFO: Python Engine initialized.

Logged in to project, explore it here https://hopsworks.ai.local/p/119


#### Read data from the feature group and perform the required feature engineering

In [5]:
# Read the contents of the raw feature group into `df`.
# The `data` column of `df` is parsed as JSON in the following cell.
df = fg_json.read()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.18s) 


In [23]:
# Parse each JSON string in the `data` column, then flatten the parsed
# records into one dataframe column per JSON field.
parsed_events = df["data"].apply(json.loads)
unnested_dataframe = pd.json_normalize(parsed_events)

# Preview the first two flattened rows.
unnested_dataframe.head(2)

Unnamed: 0,event_time,event_id,user_id,click_count,time_spent,scroll_depth,purchase_amount,ad_interaction
0,2025-03-12T17:10:57.851375,1,4,17,3.06,42.19,14.9,
1,2025-03-12T17:10:57.851693,17,11,6,19.84,39.56,457.27,


In [24]:
# Perform the required feature engineering.
# Keep only the columns that are stored in the feature groups and convert the
# string timestamps to datetime objects. `.assign` builds a new dataframe from
# the column selection, so no value is ever set on a slice of the original
# frame (which is what triggers pandas' SettingWithCopyWarning).
feature_columns = [
    "event_time",
    "event_id",
    "user_id",
    "click_count",
    "time_spent",
    "scroll_depth",
    "purchase_amount",
]
unnested_dataframe = unnested_dataframe[feature_columns].assign(
    event_time=lambda frame: pd.to_datetime(frame["event_time"])
)

# Per-user view of each event: which user triggered it and the purchase amount.
user_event_df = unnested_dataframe[["event_time", "event_id", "user_id", "purchase_amount"]]

# Per-event interaction metrics, keyed by the event id.
events_df = unnested_dataframe[["event_time", "event_id", "click_count", "time_spent", "scroll_depth"]]


#### Create feature groups

There are two feature groups being created.
1. **A user-events feature group**: This feature group stores all events for a user. Its primary key is `user_id`, hence the online feature store will only contain the latest event for each user, while the offline feature store will contain all events triggered by the user.
2. **An events feature group**: This feature group will contain all information regarding each event. Its primary key is `event_id`, hence it can be joined with the user-events feature group to recreate the entire dataframe.

Splitting the data like this also allows creating a separate *users feature group*, which could contain user-specific details and which can again be joined in when creating a feature view.

In [25]:
# Create (or fetch, if it already exists) the user-events feature group.
# `event_time` is given as the name of the timestamp column (a string), which
# is the documented form of the parameter, rather than a one-element list.
fg_user_events = fs.get_or_create_feature_group(
    name="fg_user_events",
    version=1,
    primary_key=["user_id"],
    event_time="event_time",
    online_enabled=True,
)

# Create (or fetch, if it already exists) the events feature group, keyed by
# the event id.
fg_events = fs.get_or_create_feature_group(
    name="fg_events",
    version=1,
    primary_key=["event_id"],
    event_time="event_time",
    online_enabled=True,
)

# Insert the engineered dataframes into their feature groups. The recorded
# output shows that each insert uploads the rows and launches an offline
# materialization job.
fg_user_events.insert(user_event_df)
fg_events.insert(events_df)


Feature Group created successfully, explore it at 
https://hopsworks.ai.local/p/119/fs/67/fg/26


Uploading Dataframe: 100.00% |██████████| Rows 40/40 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: fg_user_events_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://hopsworks.ai.local/p/119/jobs/named/fg_user_events_1_offline_fg_materialization/executions
Feature Group created successfully, explore it at 
https://hopsworks.ai.local/p/119/fs/67/fg/27


Uploading Dataframe: 100.00% |██████████| Rows 40/40 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: fg_events_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://hopsworks.ai.local/p/119/jobs/named/fg_events_1_offline_fg_materialization/executions


(Job('fg_events_1_offline_fg_materialization', 'SPARK'), None)

## Creating a feature view and generating train-test data

#### Define a query to join feature groups

The joins performed by Hopsworks are always point-in-time correct based on event time. Hence you can easily join the user-events and the events feature groups to create a new feature view that has point-in-time correct data.

In [28]:
# Label column from the user-events feature group.
purchase_query = fg_user_events.select("purchase_amount")

# Feature columns of the events feature group; `select_features` leaves out
# the primary key and event time (see the log message in the output).
event_features = fg_events.select_features()

# Join both on the event id, prefixing the joined event columns.
query = purchase_query.join(event_features, on="event_id", prefix="event_data_")
query.show(5)

2025-03-12 21:01:10,775 INFO: Using ['click_count', 'time_spent', 'scroll_depth'] as features for the query.To include primary key and event time use `select_all`.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.39s) 


Unnamed: 0,purchase_amount,event_data_click_count,event_data_time_spent,event_data_scroll_depth
0,69.77,0,22.38,24.49
1,177.64,20,25.11,16.27
2,438.18,2,6.72,94.29
3,304.49,20,9.3,63.89
4,157.73,1,7.26,3.21


#### Creating feature view

In [31]:
# Built-in model-dependent transformation used to scale the numeric features.
from hopsworks.hsfs.builtin_transformations import min_max_scaler

# Joined event columns that get a min-max scaler attached.
scaled_features = [
    "event_data_click_count",
    "event_data_time_spent",
    "event_data_scroll_depth",
]

# Create (or fetch) the feature view on top of the join query, with
# `purchase_amount` as the label.
fv = fs.get_or_create_feature_view(
    name="fv_events",
    version=1,
    query=query,
    transformation_functions=[min_max_scaler(column) for column in scaled_features],
    labels=["purchase_amount"],
)

# Split the feature view data into train and test sets (20% test).
X_train, X_test, y_train, y_test = fv.train_test_split(test_size=0.2)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.29s) 



## Testing a new feature

You can easily test a new feature without disturbing existing feature groups by creating a separate feature group that contains only the new test feature. This feature group can be joined with existing feature groups to create a new feature view. Once testing is done, the new feature can be appended to an existing feature group, or it can be used to create and backfill a new feature group.

In [38]:
# Look up the raw JSON feature group again and read its contents.
fg_json = fs.get_feature_group("fg_raw_event_data", version=1)
df = fg_json.read()

# Flatten the JSON payloads and convert the string timestamps to datetimes.
unnested_dataframe = pd.json_normalize(df["data"].apply(json.loads)).assign(
    event_time=lambda frame: pd.to_datetime(frame["event_time"])
)

# Keep only the key, the event time and the new feature under test.
test_columns = ["event_time", "event_id", "ad_interaction"]
test_df = unnested_dataframe[test_columns]

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.23s) 


In [39]:
# Create (or fetch, if it already exists) a separate feature group that holds
# only the feature under test, keyed by the event id.
# `event_time` is given as the name of the timestamp column (a string), which
# is the documented form of the parameter, rather than a one-element list.
fg_test = fs.get_or_create_feature_group(
    name="fg_test",
    version=1,
    primary_key=["event_id"],
    event_time="event_time",
    online_enabled=True,
)

# Insert the test feature data into the new feature group.
fg_test.insert(test_df)


Feature Group created successfully, explore it at 
https://hopsworks.ai.local/p/119/fs/67/fg/28


Uploading Dataframe: 100.00% |██████████| Rows 40/40 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: fg_test_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://hopsworks.ai.local/p/119/jobs/named/fg_test_1_offline_fg_materialization/executions


(Job('fg_test_1_offline_fg_materialization', 'SPARK'), None)

In [46]:
# Build the query for the testing feature view by joining three feature
# groups on the event id: fg_user_events (label), fg_events and fg_test.
purchase_query = fg_user_events.select("purchase_amount")
event_features = fg_events.select_features()
test_features = fg_test.select_features()

query = (
    purchase_query
    .join(event_features, on="event_id", prefix="event_data_")
    .join(test_features, on="event_id", prefix="testing_")
)
query.show(5)

2025-03-12 21:21:47,220 INFO: Using ['click_count', 'time_spent', 'scroll_depth'] as features for the query.To include primary key and event time use `select_all`.
2025-03-12 21:21:47,221 INFO: Using ['ad_interaction'] as features for the query.To include primary key and event time use `select_all`.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.42s) 


In [49]:
# Columns of the joined query that get a min-max scaler attached, including
# the new feature under test.
scaled_test_features = [
    "event_data_click_count",
    "event_data_time_spent",
    "event_data_scroll_depth",
    "testing_ad_interaction",
]

# Create (or fetch) the testing feature view with `purchase_amount` as label.
fv_test = fs.get_or_create_feature_view(
    name="fv_test",
    version=1,
    query=query,
    transformation_functions=[min_max_scaler(column) for column in scaled_test_features],
    labels=["purchase_amount"],
)

# Split the testing feature view data into train and test sets (20% test).
X_train, X_test, y_train, y_test = fv_test.train_test_split(test_size=0.2)

Feature view created successfully, explore it at 
https://hopsworks.ai.local/p/119/fs/67/fv/fv_test/version/1
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.37s) 



### Appending new features to the feature group
Once a feature has been tested and should be included in the feature group, this can be done by appending the feature to the existing feature group or by creating a new version of the feature group that contains the new feature.

In [65]:
from hopsworks.hsfs.feature import Feature

In [66]:
# Schema entry for the new feature: its name plus offline and online types.
ad_interaction_feature = Feature(
    name="ad_interaction",
    type="double",
    online_type="double",
)
features = [ad_interaction_feature]

In [67]:
# Append the new feature definitions in `features` to the existing
# `fg_events` feature group. In the recorded run this call waited for a job
# execution to finish before returning the feature group object.
fg_events.append_features(features)

2025-03-12 21:32:12,791 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-03-12 21:32:15,838 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-03-12 21:33:16,697 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-03-12 21:33:16,732 INFO: Waiting for log aggregation to finish.
2025-03-12 21:33:24,964 INFO: Execution finished successfully.


<hsfs.feature_group.FeatureGroup at 0x7f5517d038e0>

In [68]:
# Backfill the appended feature: re-read the raw feature group and rebuild
# the events dataframe, this time including `ad_interaction`.
# NOTE(review): the recorded run of this cell failed with a schema mismatch
# on `ad_interaction` (expected type 'int', derived from input 'double').
# Confirm that the declared feature type matches the data before re-running.
df = fg_json.read()
unnested_dataframe = pd.json_normalize(df["data"].apply(json.loads)).assign(
    event_time=lambda frame: pd.to_datetime(frame["event_time"])
)

backfill_columns = [
    "event_time",
    "event_id",
    "click_count",
    "time_spent",
    "scroll_depth",
    "ad_interaction",
]
events_df = unnested_dataframe[backfill_columns]

# Insert the rebuilt data into the existing feature group.
fg_events.insert(events_df)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.22s) 


FeatureStoreException: Features are not compatible with Feature Group schema: 
 - ad_interaction (expected type: 'int', derived from input: 'double') has the wrong type.
Note that feature (or column) names are case insensitive and spaces are automatically replaced with underscores.

In [69]:
# Create a new version (version 2) of the events feature group.
# Same name, primary key and event time column as version 1.
# `event_time` is given as the name of the timestamp column (a string), which
# is the documented form of the parameter, rather than a one-element list.
fg_events_v2 = fs.get_or_create_feature_group(
    name="fg_events",
    version=2,
    primary_key=["event_id"],
    event_time="event_time",
    online_enabled=True,
)





In [70]:
# Populate the new feature group version: re-read the raw feature group and
# rebuild the events dataframe, including the new `ad_interaction` column.
df = fg_json.read()
unnested_dataframe = pd.json_normalize(df["data"].apply(json.loads)).assign(
    event_time=lambda frame: pd.to_datetime(frame["event_time"])
)

v2_columns = [
    "event_time",
    "event_id",
    "click_count",
    "time_spent",
    "scroll_depth",
    "ad_interaction",
]
events_df = unnested_dataframe[v2_columns]

# Insert the data into version 2 of the events feature group.
fg_events_v2.insert(events_df)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.21s) 
Feature Group created successfully, explore it at 
https://hopsworks.ai.local/p/119/fs/67/fg/29


Uploading Dataframe: 100.00% |██████████| Rows 40/40 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: fg_events_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://hopsworks.ai.local/p/119/jobs/named/fg_events_2_offline_fg_materialization/executions


(Job('fg_events_2_offline_fg_materialization', 'SPARK'), None)