In [1]:
import hopsworks
from datetime import datetime

In [2]:
# Log in to Hopsworks and get a handle to the project's feature store.
# NOTE(review): login() is called without arguments, so host/credentials are
# presumably resolved from the environment or an interactive prompt — confirm.
project = hopsworks.login()
fs = project.get_feature_store()

2024-10-18 06:33:29,494 INFO: Python Engine initialized.

Logged in to project, explore it here https://demo.hops.works/p/123


In [3]:
# Fetch version 1 of each feature group used to build the fraud model query.
# The generator is consumed left to right by the tuple unpacking, so the
# lookups happen in the order the names are listed.
transactions, profiles, profiles_activity_5m, profiles_last_transaction = (
    fs.get_feature_group(group_name, version=1)
    for group_name in (
        "transactions",
        "profiles",
        "profiles_activity_5m",
        "profiles_last_transaction",
    )
)

In [7]:
# Label plus the per-transaction features taken from the transactions feature group.
transaction_features = transactions.select([
    'fraud_label',
    'amount',
    'category',
    'time_delta_t_minus_1',
    'loc_delta_t_minus_1',
    'is_outside_city',
])

# Account-level features (primary key and event time are left out by select_features).
profile_features = profiles.select_features()
activity_features = profiles_activity_5m.select_features()

# Every column of the last-transaction group except the primary key.
last_transaction_features = profiles_last_transaction.select_all(include_primary_key=False)

# Join everything on the account id.
# The last-transaction columns are joined with an explicit left join and a
# "last_" prefix: they are not needed as training features and are only here
# to populate the inference helper columns of the feature view.
query = (
    transaction_features
    .join(profile_features, on='account_id')
    .join(activity_features, on='account_id')
    .join(last_transaction_features, on='account_id', join_type="left", prefix="last_")
)

2024-10-18 06:34:44,497 INFO: Using ['cc_provider', 'cc_type', 'city', 'age', 'cc_expiration_days'] as features for the query.To include primary key and event time use `select_all`.
2024-10-18 06:34:44,499 INFO: Using ['count_trans', 'min_amount', 'max_amount', 'avg_amount'] as features for the query.To include primary key and event time use `select_all`.


In [8]:
# Preview 5 rows of the joined query to sanity-check the selected and prefixed columns.
query.show(5)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.81s) 


Unnamed: 0,fraud_label,amount,category,time_delta_t_minus_1,loc_delta_t_minus_1,is_outside_city,cc_provider,cc_type,city,age,cc_expiration_days,count_trans,min_amount,max_amount,avg_amount,last_transaction_datetime,last_latitude,last_longitude
0,0,70.99,Clothing,13.0,1403.668192,1.0,mastercard,debit,Lower West Side,39.589041,836,1.0,32.21,32.21,32.21,2024-10-10 13:31:58+00:00,41.75338,-86.11084
1,0,99.2,Grocery,9.0,1089.012447,1.0,mastercard,debit,Lompoc,87.29863,1595,1.0,18.19,18.19,18.19,2024-10-09 07:16:10+00:00,33.54428,-84.23381
2,0,32.6,Health/Beauty,0.0,429.477442,1.0,visa,credit,Evergreen Park,29.413699,287,1.0,6.53,6.53,6.53,2024-10-11 22:59:56+00:00,40.5576,-74.28459
3,0,88.61,Grocery,2.0,1287.527437,1.0,visa,credit,Martinsburg,54.627397,1261,1.0,11.31,11.31,11.31,2024-10-12 12:26:05+00:00,41.75338,-86.11084
4,0,191.32,Electronics,2.0,2177.818849,1.0,visa,debit,Santa Maria,59.717808,106,1.0,91.16,91.16,91.16,2024-10-14 12:12:12+00:00,39.32288,-76.72803


In [9]:
# Load the "label_encoder" transformation function from the feature store.
# It is attached to the categorical features (category, cc_provider, cc_type)
# when the feature view is registered.
label_encoder = fs.get_transformation_function(name="label_encoder")

In [10]:
# Register the feature view with the feature store, or fetch it if it is
# already registered.
#
# get_or_create_feature_view is used instead of create_feature_view so the cell
# is idempotent: create_feature_view fails once version 1 of this name exists,
# which breaks "Restart Kernel -> Run All".
#
# - labels: 'fraud_label' is the prediction target, kept apart from the features.
# - inference_helper_columns: not model features; 'city' plus the "last_"-prefixed
#   columns brought in by the query's final join.
# - transformation_functions: label-encode the categorical features.
fraud_model_fv = fs.get_or_create_feature_view(
    name="fraud_model_stream_fv",
    version=1,
    description="Fraud model feature view",
    query=query,
    labels=['fraud_label'],
    inference_helper_columns=['city', 'last_transaction_datetime', 'last_latitude', 'last_longitude'],
    transformation_functions=[
        label_encoder("category"),
        label_encoder("cc_provider"),
        label_encoder("cc_type"),
    ],
)

Feature view created successfully, explore it at 
https://demo.hops.works/p/123/fs/68/fv/fraud_model_stream_fv/version/1


In [11]:
# Create the training dataset for the model, split into train and test by event time.
#
# Hopsworks treats each window as [start, end): start is inclusive, end is
# exclusive. train_end is therefore set to test_start so the two windows are
# contiguous and non-overlapping. The previous train_end of 2024-09-30 00:00
# left every transaction from 2024-09-30 out of both splits.
# NOTE(review): these datetimes are timezone-naive while the event times shown
# by the query preview are UTC — confirm how the client interprets naive values.
train_start = datetime(year=2023, month=8, day=1)
test_start = datetime(year=2024, month=10, day=1)
train_end = test_start
test_end = datetime(year=2024, month=10, day=10)

# Materialise the split as CSV files (coalesce=True is passed through unchanged),
# with histograms and correlations enabled in the statistics config.
# The call is the last expression of the cell, so its return value is displayed.
fraud_model_fv.create_train_test_split(
    train_start=train_start,
    train_end=train_end,
    test_start=test_start,
    test_end=test_end,
    data_format="csv",
    coalesce=True,
    statistics_config={'histograms': True, 'correlations': True},
)

Training dataset job started successfully, you can follow the progress at 
https://demo.hops.works/p/123/jobs/named/fraud_model_stream_fv_1_create_fv_td_18102024063542/executions
2024-10-18 06:35:52,107 INFO: Waiting for execution to finish. Current state: INITIALIZING. Final status: UNDEFINED
2024-10-18 06:35:55,199 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2024-10-18 06:35:58,282 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2024-10-18 06:43:29,124 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2024-10-18 06:43:29,259 INFO: Waiting for log aggregation to finish.
2024-10-18 06:43:53,865 INFO: Execution finished successfully.



(1, Job('fraud_model_stream_fv_1_create_fv_td_18102024063542', 'PYSPARK'))