In [78]:
import hsfs
# Create a connection
connection = hsfs.connection()
# Get the feature store handle for the project's feature store
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

In [79]:
# Preserve original send_request method
from hsfs.client import base
send_request_original = base.Client._send_request

In [80]:
def wrap_send_request(response_instance_list):

    def _send_request_wrap(
            self,
            method,
            path_params,
            query_params=None,
            headers=None,
            data=None,
            stream=False,
            files=None,
    ):
        global send_request_original
        response_instance = RequestResponseInstance()
        
        response = send_request_original(self, method, path_params, query_params, headers, data, stream, files)
        response_instance.response = response
        response_instance.method = method
        response_instance.path_params = path_params
        response_instance.query_params = query_params
        response_instance.headers = headers
        response_instance.data = data
        response_instance.stream = stream
        response_instance.files = files
        
        response_instance_list.add(response_instance)
        
        return response

    hsfs.client.base.Client._send_request = _send_request_wrap
    
def unwrap_send_request():
    global send_request_original
    base.Client._send_request = send_request_original

In [95]:
class RequestResponseInstanceList:
    def __init__(self):
        self.items = []
        
    def add(self, response_instance):
        self.items.append(response_instance)
        
    def to_dict(self):
        d = {}
        d["items"] = list()
        for i in self.items:
            d["items"].append(i.to_dict())
        return d

class RequestResponseInstance:
    def __init__(self):
        self.response = None
        self.method = None
        self.path_params = None
        self.query_params = None
        self.headers = None
        self.data = None
        self.stream = None
        self.files = None
        
    def to_dict(self):
        d = {}
        d["response"] = self.response
        d["method"] = self.method
        d["path_params"] = self.path_params
        d["query_params"] = self.query_params
        d["headers"] = self.headers
        return d

class ResponseGenerator:
    
    def __init__(self, name):
        self.name = name

    def prepare(self):
        pass

    def call(self):
        pass

    def cleanup(self):
        pass
    
    def run(self):
        global responses_dict
        if self.name in responses_dict:
            raise Exception("fixture was already determined. remove instance from responses_dict or rename generator to continue.")
        
        response_instance_list = RequestResponseInstanceList()

        self.prepare()
        
        wrap_send_request(response_instance_list)
        self.call()
        unwrap_send_request()
        
        self.cleanup()
        
        responses_dict[self.name] = response_instance_list.to_dict()
        
        return response_instance_list

In [96]:
# specify generators...

class FeatureGroupResponseGenerator(ResponseGenerator):

    def prepare(self):

        from pyspark.sql.types import StructType, StructField, StringType, IntegerType
        data2 = [(1, "asd"),(2, "asssd"),(23, "adssd"),(1, "adsasd"),(7, "asds")]

        schema = StructType([
            StructField("intt",IntegerType(),True),
            StructField("stringt",StringType(),True)
        ])

        df = spark.createDataFrame(data=data2,schema=schema)

        from hsfs.feature import Feature
        features = [
            Feature(name="intt",type="int",online_type="int"),
            Feature(name="arrt",type="array<int>",online_type="varchar(1000)")
        ]

        features = [
            Feature(name="intt",type="int",online_type="int"),
            Feature(name="stringt",type="string",online_type="varchar(1000)")
        ]
        self.fg = fs.create_feature_group(name="fg_test",
                                     features=features,
                                     primary_key=["intt"], # key can not contain null values
                                     online_enabled=True,
                                     time_travel_format="HUDI")

        
        self.fg.save(df)

    def call(self):
        fs.get_feature_group("fg_test", version=1)
        
    def cleanup(self):
        self.fg.delete()

In [97]:
# specify generators...

class StorageConnectorResponseGenerator(ResponseGenerator):

    def prepare(self):
        pass

    def call(self):
        fs.get_storage_connector("test_project_featurestore")
        
    def cleanup(self):
        pass

In [98]:
# specify generators...

from datetime import datetime

class ExternalFeatureGroupResponseGenerator(ResponseGenerator):

    def prepare(self):
        from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
        
        data2 = [(1, "asd", datetime.strptime("2022-03-24",'%Y-%m-%d')),(2, "asssd", datetime.strptime("2022-03-20",'%Y-%m-%d')),(23, "adssd", datetime.strptime("2022-03-11",'%Y-%m-%d')),
                 (1, "adsasd", datetime.strptime("2022-03-28",'%Y-%m-%d')),(7, "asds", datetime.strptime("2022-03-1",'%Y-%m-%d'))]

        schema = StructType([
            StructField("intt",IntegerType(),True),
            StructField("stringt",StringType(),True),
            StructField("datet",DateType(),True)
        ])

        df = spark.createDataFrame(data=data2,schema=schema)

        from hsfs.feature import Feature
        features = [
            Feature(name="intt",type="int",online_type="int"),
            Feature(name="stringt",type="string",online_type="varchar(1000)"),
            Feature(name="datet",type="date",online_type="date")
        ]
        
        from hsfs import statistics_config, expectation_suite
        
        self.external_fg = fs.create_external_feature_group(
            name="external_fg_test1",
            storage_connector=fs.get_storage_connector("test_project_featurestore"),
            query="Select * from ",
            data_format="hudi",
            path="test_path",
            options={},
            version=1,
            description="test description",
            primary_key=["intt"],
            features=features,
            statistics_config=statistics_config.StatisticsConfig(),
            event_time="datet",
            expectation_suite=expectation_suite.ExpectationSuite(expectation_suite_name="test_expectation_suite_name", expectations=None, meta="{}"),
        )

        self.external_fg.save()

    def call(self):
        fs.get_external_feature_group("external_fg_test")
        
    def cleanup(self):
        self.external_fg.delete()
        pass

In [99]:
# specify generators...

from datetime import datetime

class FeatureViewResponseGenerator(ResponseGenerator):

    def prepare(self):
        from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType
        from hsfs.feature import Feature
        from hsfs.client.exceptions import RestAPIError

        # create fg 1
        try:
            fg_1 = fs.get_feature_group("fg_1_test")
        except RestAPIError:
            data_1 = [(1, "asd"),(2, "asssd"),(23, "adssd"),(1, "adsasd"),(7, "asds")]

            schema_1 = StructType([
                StructField("intt",IntegerType(),True),
                StructField("stringt",StringType(),True)
            ])

            df_1 = spark.createDataFrame(data=data_1,schema=schema_1)

            features_1 = [
                Feature(name="intt",type="int",online_type="int"),
                Feature(name="stringt",type="string",online_type="varchar(1000)")
            ]
            fg_1 = fs.create_feature_group(name="fg_1_test",
                                         features=features_1,
                                         primary_key=["intt"], # key can not contain null values
                                         online_enabled=True,
                                         time_travel_format="HUDI")

            fg_1.save(df_1)
        
        # create fg 2

        try:
            fg_2 = fs.get_feature_group("fg_2_test")
        except RestAPIError:
            data_2 = [(1, True),(2, True),(23, False),(1, True),(7, False)]

            schema_2 = StructType([
                StructField("intt",IntegerType(),True),
                StructField("boolt",BooleanType(),True)
            ])

            df_2 = spark.createDataFrame(data=data_2,schema=schema_2)

            features_2 = [
                Feature(name="intt",type="int",online_type="int"),
                Feature(name="boolt",type="boolean",online_type="boolean")
            ]
            fg_2 = fs.create_feature_group(name="fg_2_test",
                                         features=features_2,
                                         primary_key=["intt"], # key can not contain null values
                                         online_enabled=True,
                                         time_travel_format="HUDI")

            fg_2.save(df_2)
        
        # fv
        query = fg_1.select_all().join(fg_2.select_all())
        self.feature_view = fs.get_or_create_feature_view(
            name='fv_test',
            query=query,
            version=1
        )

    def call(self):
        fs.get_feature_view('fv_test')
        
    def cleanup(self):
        self.feature_view.delete()

In [111]:
# specify generators...

from datetime import datetime

class TrainingDatasetResponseGenerator(ResponseGenerator):

    def prepare(self):
        from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType
        from hsfs.feature import Feature
        from hsfs.client.exceptions import RestAPIError

        # create fg 1
        try:
            fg_1 = fs.get_feature_group("fg_1_test")
        except RestAPIError:
            data_1 = [(1, "asd"),(2, "asssd"),(23, "adssd"),(1, "adsasd"),(7, "asds")]

            schema_1 = StructType([
                StructField("intt",IntegerType(),True),
                StructField("stringt",StringType(),True)
            ])

            df_1 = spark.createDataFrame(data=data_1,schema=schema_1)

            features_1 = [
                Feature(name="intt",type="int",online_type="int"),
                Feature(name="stringt",type="string",online_type="varchar(1000)")
            ]
            fg_1 = fs.create_feature_group(name="fg_1_test",
                                         features=features_1,
                                         primary_key=["intt"], # key can not contain null values
                                         online_enabled=True,
                                         time_travel_format="HUDI")

            fg_1.save(df_1)
        
        # create fg 2

        try:
            fg_2 = fs.get_feature_group("fg_2_test")
        except RestAPIError:
            data_2 = [(1, True),(2, True),(23, False),(1, True),(7, False)]

            schema_2 = StructType([
                StructField("intt",IntegerType(),True),
                StructField("boolt",BooleanType(),True)
            ])

            df_2 = spark.createDataFrame(data=data_2,schema=schema_2)

            features_2 = [
                Feature(name="intt",type="int",online_type="int"),
                Feature(name="boolt",type="boolean",online_type="boolean")
            ]
            fg_2 = fs.create_feature_group(name="fg_2_test",
                                         features=features_2,
                                         primary_key=["intt"], # key can not contain null values
                                         online_enabled=True,
                                         time_travel_format="HUDI")

            fg_2.save(df_2)
        
        # fv

        query = fg_1.select_all().join(fg_2.select_all())
            
        self.td = out_fs.create_training_dataset(name=td_name, description="derived td description", data_format="csv", version=1, statistics_config=False)
self.td.save(td_query)
        

    def call(self):
        fs.get_training_dataset('fv_test')
        
    def cleanup(self):
        pass

In [112]:
responses_dict = {}

# run generators...
#FeatureGroupResponseGenerator("get_feature_group").run()
#StorageConnectorResponseGenerator("get_storage_connector").run()
#ExternalFeatureGroupResponseGenerator("get_external_feature_group").run()
#FeatureViewResponseGenerator("get_feature_view").run()
TrainingDatasetResponseGenerator("get_training_dataset").run()

An error was encountered:
Metadata operation error: (url: https://hopsworks.glassfish.service.consul:8182/hopsworks-api/api/project/119/featurestores/67/trainingdatasets/fv_test). Server response: 
HTTP code: 404, HTTP reason: Not Found, error code: 270012, error msg: Training dataset wasn't found., user msg: training dataset name : fv_test
Traceback (most recent call last):
  File "<stdin>", line 59, in run
  File "<stdin>", line 83, in call
  File "/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/hsfs/feature_store.py", line 294, in get_training_dataset
    return self._training_dataset_api.get(name, version)
  File "/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/hsfs/core/training_dataset_api.py", line 57, in get
    _client._send_request("GET", path_params, query_params),
  File "<stdin>", line 16, in _send_request_wrap
  File "/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/hsfs/decorators.py", line 35, in if_connected
    return fn(inst, *args, **

In [104]:
import json

print(print(json.dumps(responses_dict, indent=4, separators=(',',': '))))

{}
None

In [88]:
# write responses captured with the generators to 'backend_fixtures.json'
'''
import json
import pydoop.hdfs as hdfs

filename = f'hdfs:///Projects/{fs.project_name}/Resources/backend_fixtures.json'
with hdfs.open(filename, 'wt') as json_file:
    json.dump(responses_dict, json_file, 
                        indent=4,  
                        separators=(',',': '))
'''

"\nimport json\nimport pydoop.hdfs as hdfs\n\nfilename = f'hdfs:///Projects/{fs.project_name}/Resources/backend_fixtures.json'\nwith hdfs.open(filename, 'wt') as json_file:\n    json.dump(responses_dict, json_file, \n                        indent=4,  \n                        separators=(',',': '))\n"