In [1]:
import hopsworks
from datetime import datetime

In [2]:
# Log in to Hopsworks. No arguments are passed here, so how credentials are
# resolved is not visible in this notebook.
project = hopsworks.login()
# Feature store handle for the project; the cells below use it to look up
# feature groups, fetch the transformation function and create the feature view.
fs = project.get_feature_store()

2024-10-17 15:29:47,339 INFO: Python Engine initialized.

Logged in to project, explore it here https://demo.hops.works/p/123


In [3]:
# All three feature groups that feed the training query are read at version 1.
FEATURE_GROUP_VERSION = 1

transactions = fs.get_feature_group(
    name="transactions", version=FEATURE_GROUP_VERSION
)
profiles = fs.get_feature_group(
    name="profiles", version=FEATURE_GROUP_VERSION
)
profiles_last_transaction = fs.get_feature_group(
    name="profiles_last_transaction", version=FEATURE_GROUP_VERSION
)

In [4]:
# Label plus the transaction-level features.
transaction_features = transactions.select(
    ['fraud_label', 'amount', 'category', 'time_delta_t_minus_1', 'loc_delta_t_minus_1', 'is_outside_city']
)

# Profile features, without the primary key and event time columns.
profile_features = profiles.select_all(include_primary_key=False, include_event_time=False)

# Last-transaction features, without the primary key column.
last_transaction_features = profiles_last_transaction.select_all(include_primary_key=False)

# Both joins are keyed on account_id. The second one is a left join because its
# data is not needed in the training dataset: it is only here to populate the
# inference helper columns. Its columns are prefixed with "last_".
query = (
    transaction_features
    .join(profile_features, on='account_id')
    .join(last_transaction_features, on='account_id', join_type="left", prefix="last_")
)

In [5]:
# Preview the first five rows of the joined query as a quick sanity check.
query.show(n=5)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.03s) 


Unnamed: 0,fraud_label,amount,category,time_delta_t_minus_1,loc_delta_t_minus_1,is_outside_city,cc_provider,cc_type,city,age,cc_expiration_days,last_transaction_datetime,last_latitude,last_longitude
0,0,61.38,Grocery,0.0,3388.966515,1.0,mastercard,credit,Front Royal,66.583562,1717,2024-10-03 14:27:02+00:00,47.80527,-122.24064
1,0,1.14,Grocery,0.0,692.574282,1.0,visa,credit,Far Rockaway,36.035616,683,2024-10-05 03:49:52+00:00,42.58342,-71.8023
2,0,48.04,Grocery,10.0,0.0,1.0,visa,debit,Abilene,27.564384,-78,2024-10-15 17:37:33+00:00,43.16547,-77.70066
3,0,69.76,Grocery,1.0,1575.204164,1.0,mastercard,debit,Norwalk,64.89863,14,2024-10-15 07:08:33+00:00,39.32288,-76.72803
4,0,43.34,Restaurant/Cafeteria,2.0,563.618853,1.0,visa,credit,Front Royal,54.753425,1139,2024-10-11 09:21:49+00:00,30.17746,-81.38758


In [6]:
# Fetch the transformation function named "label_encoder" from the feature store.
# It is attached to the `category`, `cc_provider` and `cc_type` columns when the
# feature view is created below.
label_encoder = fs.get_transformation_function(name="label_encoder")

In [7]:
# Columns passed to the feature view as inference helper columns.
helper_columns = ['city', 'last_transaction_datetime', 'last_latitude', 'last_longitude']

# One label_encoder transformation per string-valued feature, in this order.
encoded_columns = ("category", "cc_provider", "cc_type")
label_encodings = [label_encoder(column) for column in encoded_columns]

# Register the feature view with the feature store; `fraud_label` is the label.
fraud_model_fv = fs.create_feature_view(
    name="fraud_model_no_streaming_fv",
    version=1,
    description="Fraud model feature view",
    query=query,
    labels=['fraud_label'],
    inference_helper_columns=helper_columns,
    transformation_functions=label_encodings,
)

Feature view created successfully, explore it at 
https://demo.hops.works/p/123/fs/68/fv/fraud_model_no_streaming_fv/version/1


In [8]:
# Create the training dataset for the model, split into train and test by time.
train_start = datetime(year=2023, month=8, day=1, hour=0, minute=0, second=0)
test_start = datetime(year=2024, month=10, day=1, hour=0, minute=0, second=0)
test_end = datetime(year=2024, month=10, day=10, hour=0, minute=0, second=0)

# The train window ends exactly where the test window begins, so the two windows
# are contiguous. The previous value (2024-09-30 00:00:00) left every event from
# 30 September outside both the train and the test split.
# NOTE(review): this assumes the end bound of a split is exclusive, so an event
# stamped exactly 2024-10-01 00:00:00 lands only in the test split. Confirm
# against the Hopsworks time-based split documentation.
train_end = test_start

# Materialise the split as CSV, coalesced, with histograms and correlations
# enabled in the statistics config. The call's return value is left as the
# cell's last expression so it is displayed as the cell output.
fraud_model_fv.create_train_test_split(
    train_start = train_start,
    train_end = train_end,
    test_start = test_start,
    test_end = test_end,
    data_format = "csv",
    coalesce = True,
    statistics_config = {'histograms': True, 'correlations': True}
)

Training dataset job started successfully, you can follow the progress at 
https://demo.hops.works/p/123/jobs/named/fraud_model_no_streaming_fv_1_create_fv_td_17102024153030/executions
2024-10-17 15:30:40,541 INFO: Waiting for execution to finish. Current state: INITIALIZING. Final status: UNDEFINED
2024-10-17 15:30:43,663 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2024-10-17 15:30:46,783 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2024-10-17 15:36:29,260 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2024-10-17 15:36:29,320 INFO: Waiting for log aggregation to finish.
2024-10-17 15:36:37,857 INFO: Execution finished successfully.



(1,
 Job('fraud_model_no_streaming_fv_1_create_fv_td_17102024153030', 'PYSPARK'))