In [1]:
import numpy as np
import pandas as pd

import hopsworks

In [2]:
# Log in to Hopsworks and keep handles to the project and its feature store;
# `fs` is used further down to create the feature group.
project = hopsworks.login()
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/164




Connected. Call `.close()` to terminate connection gracefully.


In [3]:
# Column names of the synthetic taxi-fares frame.
cols = ["taxi_id", "driver_id", "tip", "tolls", "total_fare"]

In [4]:
def generate_fares_data(n_records):
    """Generate a DataFrame of random taxi-fare records.

    Every record draws integers with ``np.random.randint`` (upper bound
    exclusive): ``total_fare`` in [3, 250), ``tip`` in [0, 60), ``tolls`` in
    [0, 6), and ``taxi_id`` / ``driver_id`` in [1, 201).

    Parameters
    ----------
    n_records : int
        Number of rows to generate. Values <= 0 produce an empty frame.

    Returns
    -------
    pandas.DataFrame
        ``n_records`` rows with a default ``RangeIndex`` and the columns
        ``total_fare``, ``tip``, ``tolls``, ``taxi_id``, ``driver_id``
        (integer dtype). When no rows are requested, an empty frame whose
        columns come from the module-level ``cols`` list is returned.
    """
    # Collect plain dicts and build the frame once: concatenating inside the
    # loop copies the whole frame on every iteration (quadratic) and, because
    # the seed frame is empty, degrades the columns to object dtype.
    # Dict entries are evaluated top to bottom, so the per-record order of
    # random draws is total_fare, tip, tolls, taxi_id, driver_id.
    records = [
        {
            "total_fare": np.random.randint(3, 250),
            "tip": np.random.randint(0, 60),
            "tolls": np.random.randint(0, 6),
            "taxi_id": np.random.randint(1, 201),
            "driver_id": np.random.randint(1, 201),
        }
        for _ in range(n_records)
    ]

    if not records:
        # Nothing to generate: keep returning an empty frame with the
        # expected column labels.
        return pd.DataFrame(columns=cols)

    return pd.DataFrame(records)

In [5]:
# Generate 100 synthetic fare records; this frame is later inserted into the feature group.
df_fares = generate_fares_data(100)

In [6]:
# Preview the first rows. A bare last expression uses the notebook's rich
# DataFrame rendering instead of plain-text print output.
df_fares.head(5)

  total_fare tip tolls taxi_id driver_id
0         74  55     3     164       103
1        244   1     5       5       177
2        223  43     3     137        85
3         10  59     2     164        97
4         17  37     5      58       128


## <span style="color:#ff5f27;"> ⚖️ Great Expectations </span> 

Great Expectations’ built-in library includes more than 50 common Expectations, such as:

    expect_column_values_to_not_be_null

    expect_column_values_to_be_unique

    expect_column_median_to_be_between...

#### You can find more expectations in the [official docs](https://greatexpectations.io/expectations/)


Clean, high-quality feature data is essential for training and serving high-quality models. Hopsworks integrates with [Great Expectations](https://greatexpectations.io/) to provide a smooth data validation workflow.

### More info: [Hopsworks data validation guide](https://docs.hopsworks.ai/3.0/user_guides/fs/feature_group/data_validation/)

In [7]:
import great_expectations as ge

# Create a new expectation suite with the Great Expectations library (an existing
# suite could be imported instead). Expectations are added to it in the next cell,
# and the suite is later passed to the feature group.
expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="validate_on_insert_suite"
)

In [8]:
# Let's add an expectation on the 'total_fare' column:
# every value must lie between 3 and 5000.
expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs=dict(column="total_fare", min_value=3, max_value=5000),
    )
)

{"expectation_type": "expect_column_values_to_be_between", "meta": {}, "kwargs": {"column": "total_fare", "min_value": 3, "max_value": 5000}}

In [9]:
# Use the Great Expectations profiler to derive an expectation suite from the data itself.

ge_profiler = ge.profile.BasicSuiteBuilderProfiler()
# profile() returns a pair: the generated suite is kept and the second element is discarded.
expectation_suite_profiler, _ = ge_profiler.profile(ge.from_pandas(df_fares))  # the DataFrame being profiled

Profiling Columns:   0%|          | 0/5 [00:00<?, ?it/s, total_fare]

2022-08-21 22:51:19,863 INFO: 	38 expectation(s) included in expectation_suite.




In [10]:
# Cast every column to int64; df_fares is rebound to the converted frame.
df_fares = df_fares.astype("int64")

In [11]:
# Let's load the ride_ids that were created moments ago for rides_fg.
df_fares["ride_id"] = pd.read_csv("new_ride_ids.csv")["ride_id"]

# The assignment aligns on the index, so a CSV with fewer rows than df_fares
# (or with empty cells) would silently leave NaN ride_ids. Fail fast here,
# because ride_id is used as the feature group's primary key.
assert df_fares["ride_id"].notna().all(), "ride_id has missing values after loading new_ride_ids.csv"

In [12]:
# Store the monetary columns as floating point ("double") rather than integers.
fare_columns = ["tip", "tolls", "total_fare"]
df_fares[fare_columns] = df_fares[fare_columns].astype("double")


In [13]:
# Fetch the "fares_fg" feature group (version 1), or create it if it does not exist yet
# (per the method name), keyed on ride_id. The Great Expectations suite built earlier
# is attached via `expectation_suite`.
# NOTE(review): online_enabled / time_travel_format / statistics_config semantics are
# defined by Hopsworks — confirm against its documentation before changing them.
fares_fg = fs.get_or_create_feature_group(name="fares_fg",
                                          version=1,
                                          primary_key=["ride_id"], 
                                          description="Taxi fares features",
                                          expectation_suite=expectation_suite,
                                          time_travel_format="HUDI",  
                                          online_enabled=True,
                                          statistics_config=True)   
# Insert the generated fares. As the last expression of the cell, the return value
# (a job handle plus the validation report, see the recorded output) is displayed.
fares_fg.insert(df_fares)

2022-08-21 22:51:20,467 INFO: 	1 expectation(s) included in expectation_suite.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/164/fs/106/fg/620


Uploading Dataframe: 0.00% |          | Rows 0/100 | Elapsed Time: 00:00 | Remaining Time: ?

Launching offline feature group backfill job...
Backfill Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/164/jobs/named/fares_fg_1_offline_fg_backfill/executions


(<hsfs.core.job.Job at 0x285c91006d0>,
 {
   "evaluation_parameters": {},
   "meta": {
     "great_expectations_version": "0.15.18",
     "expectation_suite_name": "validate_on_insert_suite",
     "run_id": {
       "run_name": null,
       "run_time": "2022-08-21T20:51:20.466521+00:00"
     },
     "batch_kwargs": {
       "ge_batch_id": "0287037a-2193-11ed-91d8-14abc5f42df5"
     },
     "batch_markers": {},
     "batch_parameters": {},
     "validation_time": "20220821T205120.466521Z",
     "expectation_suite_meta": {
       "great_expectations_version": "0.15.18"
     }
   },
   "success": true,
   "statistics": {
     "evaluated_expectations": 1,
     "successful_expectations": 1,
     "unsuccessful_expectations": 0,
     "success_percent": 100.0
   },
   "results": [
     {
       "expectation_config": {
         "expectation_type": "expect_column_values_to_be_between",
         "meta": {
           "expectationId": 239
         },
         "kwargs": {
           "column": "total