In [1]:
import hopsworks
import pandas as pd
import json
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Functions that generate some simulated data
from generate_data import generate_events, generate_users

## Inserting the JSON into a feature group

#### Connect to Hopsworks

In [2]:
# Login to Hopsworks; the returned object is the project handle used below.
project = hopsworks.login()

# Fetch the project's feature store handle; every feature group / feature view call below goes through `fs`.
fs = project.get_feature_store()

2025-03-13 09:32:13,190 INFO: Initializing external client
2025-03-13 09:32:13,191 INFO: Base URL: https://10.87.41.143:28181
2025-03-13 09:32:14,776 INFO: Python Engine initialized.

Logged in to project, explore it here https://10.87.41.143:28181/p/119


#### Create a dataframe using the json data.

In [3]:
# Generate 20 simulated events. As the output below shows, the result is a list
# of JSON strings, one per event.
data = generate_events(20)
data

['{"event_time": "2028-02-11 00:00:00", "event_id": 0, "user_id": 1, "click_count": 7, "time_spent": 7.08, "scroll_depth": 73.65, "purchase_completed": 0, "checkout_time": "2028-02-11 04:00:00"}',
 '{"event_time": "2024-06-22 13:00:00", "event_id": 1, "user_id": 14, "click_count": 0, "time_spent": 3.26, "scroll_depth": 23.27, "purchase_completed": 0, "checkout_time": "2024-06-22 17:00:00"}',
 '{"event_time": "2033-09-19 05:00:00", "event_id": 2, "user_id": 7, "click_count": 17, "time_spent": 12.88, "scroll_depth": 44.92, "purchase_completed": 1, "checkout_time": "2033-09-19 11:00:00"}',
 '{"event_time": "2024-02-05 11:00:00", "event_id": 3, "user_id": 28, "click_count": 5, "time_spent": 21.1, "scroll_depth": 34.03, "purchase_completed": 0, "checkout_time": "2024-02-05 12:00:00"}',
 '{"event_time": "2029-01-12 06:00:00", "event_id": 4, "user_id": 25, "click_count": 3, "time_spent": 3.24, "scroll_depth": 9.67, "purchase_completed": 1, "checkout_time": "2029-01-12 10:00:00"}',
 '{"event_t

In [4]:
# Wrap the raw JSON strings in a pandas dataframe and, in the same step, pull the
# primary key (`event_id`) out of every JSON document into its own column.
df = pd.DataFrame({"data": data}).assign(
    event_id=lambda frame: frame["data"].apply(
        lambda raw_event: json.loads(raw_event)["event_id"]
    )
)

#### Create a feature group and insert data

By default the json column is stored as a `STRING`, which maps to `VARCHAR(100)` in the online feature store. Inserting json data longer than 100 characters into an online-enabled feature group will therefore result in an error. However, this can be overcome by [explicitly specifying the schema](https://docs.hopsworks.ai/latest/user_guides/fs/feature_group/data_types/#explicit-schema-definition) of the feature group so that the column uses a much bigger data type. The different data types that can be used for storing json data can be found in the documentation [here](https://docs.hopsworks.ai/latest/user_guides/fs/feature_group/data_types/#string-online-data-types).  

In [5]:
# Fetch version 1 of the raw-event feature group, creating it if it does not
# exist yet. Rows are keyed by `event_id`.
fg = fs.get_or_create_feature_group(
    name="fg_raw_event_data",
    version=1,
    primary_key=["event_id"],
)

In [6]:
# Insert the dataframe. `wait=True` blocks until the materialization job launched
# by the insert has finished (see the "Waiting for execution to finish" log lines below).
fg.insert(df, wait=True)

Feature Group created successfully, explore it at 
https://10.87.41.143:28181/p/119/fs/67/fg/36


Uploading Dataframe: 100.00% |███████████████████████████████████████████████| Rows 20/20 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: fg_raw_event_data_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://10.87.41.143:28181/p/119/jobs/named/fg_raw_event_data_1_offline_fg_materialization/executions
2025-03-13 09:32:41,053 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-03-13 09:32:44,466 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-03-13 09:34:07,894 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-03-13 09:34:08,044 INFO: Waiting for log aggregation to finish.
2025-03-13 09:34:17,098 INFO: Execution finished successfully.


(Job('fg_raw_event_data_1_offline_fg_materialization', 'SPARK'), None)

#### Inserting more data with more features

In [7]:
# Generate another batch of events; it is inserted into the feature group in the next cell.
# As the output below shows, some of these events carry the new feature `ad_interaction`.
data = generate_events(20)
data

['{"event_time": "2027-12-09 10:00:00", "event_id": 20, "user_id": 29, "click_count": 16, "time_spent": 22.98, "scroll_depth": 50.77, "purchase_completed": 0, "checkout_time": "2027-12-09 16:00:00"}',
 '{"event_time": "2028-06-17 21:00:00", "event_id": 21, "user_id": 21, "click_count": 20, "time_spent": 15.48, "scroll_depth": 19.89, "purchase_completed": 1, "checkout_time": "2028-06-18 03:00:00", "ad_interaction": 1}',
 '{"event_time": "2031-12-06 10:00:00", "event_id": 22, "user_id": 18, "click_count": 0, "time_spent": 18.17, "scroll_depth": 48.86, "purchase_completed": 0, "checkout_time": "2031-12-06 12:00:00", "ad_interaction": 2}',
 '{"event_time": "2024-11-12 08:00:00", "event_id": 23, "user_id": 8, "click_count": 7, "time_spent": 26.4, "scroll_depth": 94.69, "purchase_completed": 0, "checkout_time": "2024-11-12 13:00:00", "ad_interaction": 3}',
 '{"event_time": "2025-01-12 23:00:00", "event_id": 24, "user_id": 27, "click_count": 17, "time_spent": 23.09, "scroll_depth": 12.84, "pu

In [8]:
# Creating a pandas dataframe from the second batch of raw JSON strings.
df = pd.DataFrame({"data": data})

# Extract the primary key out of each JSON document and store it in a new column.
df["event_id"] = df["data"].apply(lambda x: json.loads(x)["event_id"])

# Insert the dataframe. As with the first insert, wait for the materialization
# job to finish: the following cells read training data from this feature group,
# and without waiting the second batch may not have landed yet.
fg.insert(df, wait=True)

Uploading Dataframe: 100.00% |███████████████████████████████████████████████| Rows 20/20 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: fg_raw_event_data_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://10.87.41.143:28181/p/119/jobs/named/fg_raw_event_data_1_offline_fg_materialization/executions


(Job('fg_raw_event_data_1_offline_fg_materialization', 'SPARK'), None)

## Creating the first test model

#### Option 1: Create a feature view using the json data. Then perform the json parsing after creating the train-test split

In [9]:
# Build the query that reads from the feature group. `select_features()` selects
# only the feature columns (here just `data`, as the log below reports).
query = fg.select_features()

# Fetch version 1 of the feature view backed by that query, creating it if needed.
fv = fs.get_or_create_feature_view(
    name="fv_test_1",
    query=query,
    version=1,
)

2025-03-13 09:35:18,199 INFO: Using ['data'] as features for the query.To include primary key and event time use `select_all`.
Feature view created successfully, explore it at 
https://10.87.41.143:28181/p/119/fs/67/fv/fv_test_1/version/1


In [11]:
# Draw an 80/20 train-test split from the feature view. The two label outputs are
# discarded here; the targets are taken from the parsed JSON further down.
X_train, X_test, _, _ = fv.train_test_split(test_size=0.2)

# Decode each JSON string and flatten the resulting dicts into regular columns.
X_train, X_test = (
    pd.json_normalize(split["data"].apply(json.loads)) for split in (X_train, X_test)
)

# Target: whether the purchase was completed.
y_train, y_test = X_train[["purchase_completed"]], X_test[["purchase_completed"]]

# Predictors: keep only the columns the model is trained on.
X_train, X_test = (
    split[["click_count", "time_spent", "scroll_depth"]] for split in (X_train, X_test)
)

# Do any feature engineering if required and train the required model

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.58s) 

2025-03-13 09:35:47,035 INFO: Provenance cached data - overwriting last accessed/created training dataset from 1 to 2.


#### Option 2: Create a feature view that uses a model-dependent transformation function to extract and split the json data into features.

However it is important to note that aggregations inside transformation functions are not currently supported.
You can find more information regarding transformation functions [here](https://docs.hopsworks.ai/latest/user_guides/fs/transformation_functions/)

In [17]:
@hopsworks.udf([int, float, float, int], mode="pandas", drop=["data"])
def extract_features(data):
    """Split the raw JSON column into the individual model features.

    The declared return types follow the order of the returned values:
    `click_count` (int), `time_spent` (float), `scroll_depth` (float) and
    `purchase_completed` (int). `time_spent` and `scroll_depth` hold fractional
    values in the event data, so they are declared as floats.

    Args:
        data: pandas Series of JSON strings (the `data` feature); it is dropped
            from the output because of `drop=["data"]`.

    Returns:
        A tuple of four pandas Series, in the order listed above.
    """
    # These imports need to live inside the function only because the
    # transformation function is defined in a jupyter notebook; in a regular
    # module they can be imported at module level instead.
    from pandas import json_normalize
    import json

    # Decode every JSON string and flatten the dicts into one column per key.
    parsed = json_normalize(data.apply(json.loads))

    # Return the features
    return (
        parsed["click_count"],
        parsed["time_spent"],
        parsed["scroll_depth"],
        parsed["purchase_completed"],
    )




In [22]:
# Fetch (or create) a second feature view whose `data` feature is passed through
# the `extract_features` transformation function; `alias` names the four outputs.
fv = fs.get_or_create_feature_view(
    name="fv_test_2",
    query=fg.select_features(),
    version=1,
    transformation_functions=[
        extract_features("data").alias(
            "click_count", "time_spent", "scroll_depth", "purchase_completed"
        )
    ],
)

2025-03-13 09:37:30,983 INFO: Using ['data'] as features for the query.To include primary key and event time use `select_all`.
Feature view created successfully, explore it at 
https://10.87.41.143:28181/p/119/fs/67/fv/fv_test_2/version/1


In [23]:
# Draw an 80/20 train-test split; the JSON has already been expanded into
# columns by the feature view's transformation function.
X_train, X_test, _, _ = fv.train_test_split(test_size=0.2)

# Target: whether the purchase was completed.
y_train, y_test = X_train[["purchase_completed"]], X_test[["purchase_completed"]]

# Predictors: keep only the columns the model is trained on.
X_train, X_test = (
    split[["click_count", "time_spent", "scroll_depth"]] for split in (X_train, X_test)
)

# Do any feature engineering if required and train your model .....



Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.52s) 

