In [1]:
import hopsworks
import pandas as pd
import json
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Function that generates some similated event data
from generate_data import generate_events, generate_users

## Inserting the JSON into a feature group

#### Connect to Hopsworks

In [2]:
# Login to Hopsworks.
project = hopsworks.login()

# Fetch the feature store.
fs = project.get_feature_store()

2025-03-12 22:48:39,080 INFO: Initializing external client
2025-03-12 22:48:39,081 INFO: Base URL: https://10.87.41.143:28181
2025-03-12 22:48:39,694 INFO: Python Engine initialized.

Logged in to project, explore it here https://10.87.41.143:28181/p/119


#### Create a dataframe using the json data.

In [3]:
# Generating simulated json data
data = generate_events(20)
data

['{"event_time": "2025-03-12T21:48:40.177349", "event_id": 0, "user_id": 1, "click_count": 8, "time_spent": 7.72, "scroll_depth": 13.95, "purchase_amount": 51.25}',
 '{"event_time": "2025-03-12T21:48:40.177399", "event_id": 1, "user_id": 24, "click_count": 17, "time_spent": 3.06, "scroll_depth": 42.19, "purchase_amount": 14.9}',
 '{"event_time": "2025-03-12T21:48:40.177446", "event_id": 2, "user_id": 7, "click_count": 7, "time_spent": 15.41, "scroll_depth": 2.65, "purchase_amount": 99.42}',
 '{"event_time": "2025-03-12T21:48:40.177558", "event_id": 3, "user_id": 21, "click_count": 17, "time_spent": 12.88, "scroll_depth": 44.92, "purchase_amount": 139.1}',
 '{"event_time": "2025-03-12T21:48:40.177579", "event_id": 4, "user_id": 28, "click_count": 0, "time_spent": 22.88, "scroll_depth": 15.97, "purchase_amount": 211.31}',
 '{"event_time": "2025-03-12T21:48:40.177595", "event_id": 5, "user_id": 9, "click_count": 4, "time_spent": 6.85, "scroll_depth": 76.35, "purchase_amount": 51.11}',
 '{

In [4]:
# Creating a pandas dataframe
df = pd.DataFrame({"data" : data})

# Extract primary key out json and create a new column in the pandas dataframe. 
df["event_id"] = df["data"].apply(lambda x : json.loads(x)["event_id"])

#### Create a feature group and insert data

By default the json column is stored as a `STRING` which is stored as `VARCHAR(100)` in the offline feature store. So insertion of any json data that has length greater than 100 characters will result in an error. However this can be overcome [explcitly specifing the schema](https://docs.hopsworks.ai/latest/user_guides/fs/feature_group/data_types/#explicit-schema-definition) of the feature group to allow the column to take a much bigger json type. Different data types that can be used for storing json data can be found in the documentation [here](https://docs.hopsworks.ai/latest/user_guides/fs/feature_group/data_types/#string-online-data-types).  

In [5]:
# Create a feature group.
fg = fs.get_or_create_feature_group(name = "fg_raw_event_data" ,
                                    version = 1, 
                                    primary_key = ["event_id"])

In [6]:
# Insert the dataframe
fg.insert(df, wait=True)

Feature Group created successfully, explore it at 
https://10.87.41.143:28181/p/119/fs/67/fg/30


Uploading Dataframe: 100.00% |█| Rows 20/20 | Elapsed Time: 00:00 | Remaining Ti


Launching job: fg_raw_event_data_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://10.87.41.143:28181/p/119/jobs/named/fg_raw_event_data_1_offline_fg_materialization/executions
2025-03-12 22:48:52,412 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-03-12 22:48:55,495 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-03-12 22:50:21,746 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-03-12 22:50:21,814 INFO: Waiting for log aggregation to finish.
2025-03-12 22:50:30,233 INFO: Execution finished successfully.


(Job('fg_raw_event_data_1_offline_fg_materialization', 'SPARK'), None)

#### Inserting more data with a more features

In [7]:
# Generating some more data with a new features and inserting it into a feature group
data = generate_events(20)
data

['{"event_time": "2025-03-12T21:50:30.255624", "event_id": 20, "user_id": 5, "click_count": 16, "time_spent": 15.06, "scroll_depth": 75.58, "purchase_amount": 430.55}',
 '{"event_time": "2025-03-12T21:50:30.255671", "event_id": 21, "user_id": 5, "click_count": 20, "time_spent": 5.22, "scroll_depth": 68.05, "purchase_amount": 298.2, "ad_interaction": 3}',
 '{"event_time": "2025-03-12T21:50:30.255709", "event_id": 22, "user_id": 13, "click_count": 19, "time_spent": 29.89, "scroll_depth": 52.91, "purchase_amount": 485.54, "ad_interaction": 0}',
 '{"event_time": "2025-03-12T21:50:30.255723", "event_id": 23, "user_id": 22, "click_count": 3, "time_spent": 20.61, "scroll_depth": 53.7, "purchase_amount": 133.41, "ad_interaction": 5}',
 '{"event_time": "2025-03-12T21:50:30.255746", "event_id": 24, "user_id": 11, "click_count": 3, "time_spent": 9.16, "scroll_depth": 15.82, "purchase_amount": 1.62, "ad_interaction": 5}',
 '{"event_time": "2025-03-12T21:50:30.255760", "event_id": 25, "user_id": 29

In [8]:
# Creating a pandas dataframe
df = pd.DataFrame({"data" : data})

# Extract primary key out json and create a new column in the pandas dataframe. 
df["event_id"] = df["data"].apply(lambda x : json.loads(x)["event_id"])

# Insert the dataframe
fg.insert(df)

Uploading Dataframe: 100.00% |█| Rows 20/20 | Elapsed Time: 00:00 | Remaining Ti


Launching job: fg_raw_event_data_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://10.87.41.143:28181/p/119/jobs/named/fg_raw_event_data_1_offline_fg_materialization/executions


(Job('fg_raw_event_data_1_offline_fg_materialization', 'SPARK'), None)

## Creating the first test model

#### Option 1: Create a feature view using the json data. The perform the json parsing after creating train test-split

In [9]:
# Write a query to read data from required feature group.
query = fg.select_features()

# Create a feature view.
fv = fs.get_or_create_feature_view(name="fv_test_1", 
                                   query = query, 
                                   version=1)

2025-03-12 22:50:36,918 INFO: Using ['data'] as features for the query.To include primary key and event time use `select_all`.
Feature view created successfully, explore it at 
https://10.87.41.143:28181/p/119/fs/67/fv/fv_test_1/version/1


In [10]:
# Create a train-test split 
X_train, X_test, _, _ = fv.train_test_split(test_size = 0.2)

# Json normalize the extracted dataframe.
X_train = pd.json_normalize(X_train["data"].apply(lambda x : json.loads(x)))
X_test = pd.json_normalize(X_test["data"].apply(lambda x : json.loads(x)))

# Parse the required features
y_train = X_train[["purchase_amount"]]
y_test = X_test[["purchase_amount"]]

X_train = X_train[["click_count", "time_spent", "scroll_depth"]]
X_test = X_test[["click_count", "time_spent", "scroll_depth"]]

# Do any feature enginerring if required .....

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.28s) 



In [11]:
# Train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

#### Option 2: Create a feature view using the have a model dependent transformation function that performs the extract and splitting of json data into features.

However it is important to note that aggregations inside transformation functions are not currently supported.
You can find more information regarding transformation functions here: https://docs.hopsworks.ai/latest/user_guides/fs/transformation_functions/

In [12]:
@hopsworks.udf([int, int ,int, int], mode="pandas", drop=["data"])
def extract_features(data):
    # Need to be imported inside the function only if function defined in a jupyter notebook otherwise can be imported inside the file instead of the funciton.
    from pandas import json_normalize
    import json
    
    # Normalize the data
    data = json_normalize(data.apply(lambda x : json.loads(x)))

    # Return the features
    return data["click_count"], data["time_spent"], data["scroll_depth"], data["purchase_amount"]




In [13]:
fv = fs.get_or_create_feature_view(name="fv_test_2", 
                                   query = fg.select_features(), 
                                   version=1, 
                                   transformation_functions = [extract_features("data").alias("click_count", "time_spent", "scroll_depth", "purchase_amount")])

2025-03-12 22:50:40,765 INFO: Using ['data'] as features for the query.To include primary key and event time use `select_all`.
Feature view created successfully, explore it at 
https://10.87.41.143:28181/p/119/fs/67/fv/fv_test_2/version/1


In [14]:
X_train, X_test, _, _ = fv.train_test_split(test_size = 0.2)

# Parse the required features
y_train = X_train[["purchase_amount"]]
y_test = X_test[["purchase_amount"]]

X_train = X_train[["click_count", "time_spent", "scroll_depth"]]
X_test = X_test[["click_count", "time_spent", "scroll_depth"]]

# Do any feature enginerring if required .....



Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.24s) 



In [15]:
# Train a model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse

17703.271255897707