In [1]:
import hsfs
# Create a connection
connection = hsfs.connection()
# Get the feature store handle for the project's feature store
fs = connection.get_feature_store()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
21,application_1624276539905_0001,pyspark,idle,Link,Link


SparkSession available as 'spark'.
Connected. Call `.close()` to terminate connection gracefully.

In [4]:
from hops import hdfs
from pyspark.sql import functions as F

# hdfs:///Projects/testholuser20/testholuser20_Training_Datasets/teams_features.csv
teams_csv = spark.read\
             .option("inferSchema", "true")\
             .option("header", "true")\
             .format("csv")\
             .load("hdfs:///Projects/{}/testcaixa2_Training_Datasets//teams_features.csv".format(hdfs.project_name()))

In [5]:
teams_csv.count()

50

In [6]:
online_teams_fg_meta = fs.create_feature_group(name="teams_features",
                                       version=1,
                                       primary_key=['team_id'],
                                       description="Store related features",
                                       online_enabled=True,
                                       time_travel_format=None,
                                       statistics_config={"enabled": True, "histograms": True, "correlations": True})

In [7]:
online_teams_fg_meta.save(teams_csv)

<hsfs.feature_group.FeatureGroup object at 0x7fad39a6d950>

In [14]:
country_csv = spark.read\
             .option("inferSchema", "true")\
             .option("header", "true")\
             .format("csv")\
             .option("delimiter",";")\
             .load("hdfs:///Projects/{}/testcaixa2_Training_Datasets//data_cleaned_train.csv".format(hdfs.project_name()))

In [15]:
country_csv.count()

10592

In [16]:
online_name_country_fg_meta = fs.create_feature_group(name="name_country_fg",
                                       version=1,
                                       primary_key=['first_name', 'last_name'],
                                       description="Name - Country prediction",
                                       online_enabled=True,
                                       time_travel_format=None,
                                       statistics_config={"enabled": True, "histograms": True, "correlations": True})

In [17]:
online_name_country_fg_meta.save(country_csv)

<hsfs.feature_group.FeatureGroup object at 0x7fad39a635d0>

# Get

In [8]:
teams_features = fs.get_feature_group("teams_features",version=1)
teams_features.show(5)

+-----------+-------+-------------+
|team_budget|team_id|team_position|
+-----------+-------+-------------+
|  12957.076|      1|            1|
|  2403.3704|      2|            2|
|  3390.3755|      3|            3|
|  13547.429|      4|            4|
|   9678.333|      5|            5|
+-----------+-------+-------------+
only showing top 5 rows

In [18]:
print("Name: {}".format(name_country_fg.name))
print("Description: {}".format(name_country_fg.description))
print("Features:")
features = name_country_fg.features
for feature in features:
    print("{:<60} \t Primary: {} \t Partition: {}".format(feature.name, feature.primary, feature.partition))

Name: name_country_fg
Description: Name - Country prediction
Features:

# Feature Data Validation

In [19]:
from hsfs.rule import Rule
rules = connection.get_rules()
[print(rule.to_dict()) for rule in rules]

{'name': 'HAS_MIN', 'predicate': 'VALUE', 'acceptedType': 'Fractional', 'description': 'Assert on the min of a feature.'}
{'name': 'HAS_MAX', 'predicate': 'VALUE', 'acceptedType': 'Fractional', 'description': 'Assert on the max of a feature.'}
{'name': 'HAS_COMPLETENESS', 'predicate': 'VALUE', 'acceptedType': 'Fractional', 'description': 'Assert on the uniqueness of a single or combined set of features.'}
{'name': 'HAS_UNIQUE_VALUE_RATIO', 'predicate': 'VALUE', 'acceptedType': 'Fractional', 'description': 'Assert on the unique value ratio of of a single or combined set of features.'}
{'name': 'HAS_NUMBER_OF_DISTINCT_VALUES', 'predicate': 'VALUE', 'acceptedType': 'Integral', 'description': 'Assert on the number of distinct values of a feature.'}
{'name': 'HAS_ENTROPY', 'predicate': 'VALUE', 'acceptedType': 'Fractional', 'description': 'Assert on the entropy of a feature.'}
{'name': 'HAS_MUTUAL_INFORMATION', 'predicate': 'LEGAL_VALUES', 'acceptedType': 'Fractional', 'description': 'Asser

In [20]:
expectation_countries = fs.create_expectation("countries",
                                          description="min and max number of countries",
                                          features=["country"], 
                                          rules=[Rule(name="HAS_NUMBER_OF_DISTINCT_VALUES", level="ERROR", min=1), 
                                                 Rule(name="HAS_NUMBER_OF_DISTINCT_VALUES", level="ERROR", max=195)])
expectation_countries.save()

expectation.rules[0].to_dict(){'name': 'HAS_NUMBER_OF_DISTINCT_VALUES', 'level': 'ERROR', 'min': 1, 'max': None, 'value': None, 'pattern': None, 'acceptedType': None, 'legalValues': None}
ExpectationsApi.expectation.to_dict(){'name': 'countries', 'description': 'min and max number of countries', 'features': ['country'], 'rules': [<hsfs.rule.Rule object at 0x7fad3371a8d0>, <hsfs.rule.Rule object at 0x7fad3371a050>]}
ExpectationsApi.expectation.rules[0].to_dict(){'name': 'HAS_NUMBER_OF_DISTINCT_VALUES', 'level': 'ERROR', 'min': 1, 'max': None, 'value': None, 'pattern': None, 'acceptedType': None, 'legalValues': None}
ExpectationsApi.expectation.payload{"name": "countries", "description": "min and max number of countries", "features": ["country"], "rules": [{"name": "HAS_NUMBER_OF_DISTINCT_VALUES", "level": "ERROR", "min": 1, "max": null, "value": null, "pattern": null, "acceptedType": null, "legalValues": null}, {"name": "HAS_NUMBER_OF_DISTINCT_VALUES", "level": "ERROR", "min": null, "ma

In [21]:
name_country_fg.attach_expectation(expectation_countries)

An error was encountered:
Metadata operation error: (url: https://hopsworks.glassfish.service.consul:8182/hopsworks-api/api/project/124/featurestores/68/featuregroups/None/expectations/countries). Server response: 
HTTP code: 404, HTTP reason: Not Found, error code: 120004, error msg: Web application exception occurred, user msg: HTTP 404 Not Found
Traceback (most recent call last):
  File "/srv/hops/anaconda/envs/theenv/lib/python3.7/site-packages/hsfs/feature_group.py", line 897, in attach_expectation
    return self._expectations_api.attach(self, expectation.name)
  File "/srv/hops/anaconda/envs/theenv/lib/python3.7/site-packages/hsfs/core/expectations_api.py", line 77, in attach
    _client._send_request("PUT", path_params)
  File "/srv/hops/anaconda/envs/theenv/lib/python3.7/site-packages/hsfs/decorators.py", line 35, in if_connected
    return fn(inst, *args, **kwargs)
  File "/srv/hops/anaconda/envs/theenv/lib/python3.7/site-packages/hsfs/client/base.py", line 147, in _send_requ

In [22]:
# Create a Pandas Dataframe and ingest its features into a feature group that you create here.  
import pandas as pd 
columns = ['first_name', 'last_name', 'country']
data = [['tom', 'johnson', 'UK'], ['penelope', 'charles', 'UK'], ['harry', 'windsor', "USA"]]   
df = pd.DataFrame(data, columns=columns) 
name_country_fg.insert(df)

An error was encountered:
An error occurred while calling o371.save.
: org.apache.hudi.exception.HoodieException: 'hoodie.table.name', 'path' must be set.
	at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:75)
	at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:134)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
