In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import hopsworks

In [2]:
# Log in to Hopsworks and keep a handle to the project.
project = hopsworks.login()
# Feature store handle; used below to look up the fares feature group.
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/164




Connected. Call `.close()` to terminate connection gracefully.


In [3]:
# Handle to version 1 of the `fares_fg` feature group
# (going by the method name, it is created if it does not exist yet).
fares_fg = fs.get_or_create_feature_group(name="fares_fg", version=1)

In [4]:
# Read all features currently stored in the feature group; the result is
# used below to find the last ride id and the known taxi/driver ids.
old_df_fares = fares_fg.select_all().read()

2022-08-16 23:16:36,054 INFO: USE `romankah_featurestore`
2022-08-16 23:16:37,030 INFO: SELECT `fg0`.`ride_id` `ride_id`, `fg0`.`taxi_id` `taxi_id`, `fg0`.`driver_id` `driver_id`, `fg0`.`tip` `tip`, `fg0`.`tolls` `tolls`, `fg0`.`total_fare` `total_fare`
FROM `romankah_featurestore`.`fares_fg_1` `fg0`




In [5]:
# Preview the first five rows of the existing fares data.
old_df_fares.head(5)

Unnamed: 0,ride_id,taxi_id,driver_id,tip,tolls,total_fare
0,5139,2013000188,2013000188,6.0,0.0,19.0
1,5526,2013000053,2013000053,15.0,0.0,52.0
2,3910,2013000023,2013000023,20.0,3.0,73.0
3,29577,2013000019,2013000019,18.0,0.0,63.0
4,29441,2013000085,2013000085,20.0,0.0,69.0


In [6]:
# Highest ride id already stored; newly generated rides continue from here.
last_ride_id = old_df_fares["ride_id"].max()

In [7]:
# Show the id that the generated rides will start after.
print("Last ride_id:", last_ride_id)

Last ride_id: 41178


In [8]:
# Column names of the fares data, in the order shown for the existing feature group.
cols = [
    "ride_id",
    "taxi_id",
    "driver_id",
    "tip",
    "tolls",
    "total_fare",
]

In [9]:
def generate_fares_data(n_records, start_ride_id=None, taxi_ids=None, driver_ids=None):
    """Generate ``n_records`` rows of synthetic fare data.

    Ride ids continue after ``start_ride_id`` (``start_ride_id + 1`` up to
    ``start_ride_id + n_records``). Fare, tip and tolls are random integers,
    and taxi/driver ids are drawn at random from the given id pools.

    Parameters
    ----------
    n_records : int
        Number of rows to generate. For values below 1 an empty frame with
        the notebook-level ``cols`` columns is returned.
    start_ride_id : int, optional
        Last ride id already in use. Defaults to the notebook-level
        ``last_ride_id``.
    taxi_ids : array-like, optional
        Pool of taxi ids to sample from. Defaults to the unique ``taxi_id``
        values of the notebook-level ``old_df_fares``.
    driver_ids : array-like, optional
        Pool of driver ids to sample from. Defaults to the unique
        ``driver_id`` values of the notebook-level ``old_df_fares``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ride_id, total_fare, tip, tolls, taxi_id, driver_id``,
        ordered newest ride first, with a fresh 0..n-1 index.
    """
    if n_records < 1:
        return pd.DataFrame(columns=cols)

    # Fall back to notebook state only for values the caller did not supply.
    if start_ride_id is None:
        start_ride_id = last_ride_id
    # The id pools are loop-invariant, so compute them once up front.
    if taxi_ids is None:
        taxi_ids = old_df_fares.taxi_id.unique()
    if driver_ids is None:
        driver_ids = old_df_fares.driver_id.unique()
    taxi_ids = np.asarray(taxi_ids)
    driver_ids = np.asarray(driver_ids)

    # Collect plain records and build the frame once; concatenating inside
    # the loop copies the whole frame on every iteration.
    records = []
    for offset in range(1, n_records + 1):
        records.append({
            "ride_id": start_ride_id + offset,
            "total_fare": np.random.randint(3, 250),
            "tip": np.random.randint(0, 60),
            "tolls": np.random.randint(0, 6),
            "taxi_id": np.random.choice(taxi_ids),
            "driver_id": np.random.choice(driver_ids),
        })

    # Newest ride first, matching the order the rows were previously stacked in.
    records.reverse()
    return pd.DataFrame(records)

In [10]:
# Generate 100 synthetic fare records; these are inserted into the feature group further down.
df_fares = generate_fares_data(100)

In [11]:
# Inspect the generated records (plain-text rendering).
print(df_fares)

   ride_id total_fare tip tolls     taxi_id   driver_id
0    41278        213   3     3  2013000053  2013000094
1    41277        205  30     3  2013000002  2013000038
2    41276        117  58     2  2013000097  2013000095
3    41275         33   2     3  2013000088  2013000107
4    41274         60  58     2  2013000112  2013000009
..     ...        ...  ..   ...         ...         ...
95   41183        200  50     0  2013000037  2013000020
96   41182        247   9     2  2013000118  2013000157
97   41181        102  36     4  2013000026  2013000079
98   41180        232  18     2  2013000112  2013000077
99   41179        182  14     0  2013000110  2013000064

[100 rows x 6 columns]


## <span style="color:#ff5f27;"> ⚖️ Great Expectations </span> 

Great Expectations’ built-in library includes more than 50 common Expectations, such as:

    expect_column_values_to_not_be_null

    expect_column_values_to_be_unique

    expect_column_median_to_be_between...

#### You can find more expectations in the [official docs](https://greatexpectations.io/expectations/)


Clean, high-quality feature data is essential for training and serving high-quality models. Hopsworks integrates with [Great Expectations](https://greatexpectations.io/) to enable a smooth data validation workflow.

### `More info` - [here](https://docs.hopsworks.ai/3.0/user_guides/fs/feature_group/data_validation/)

In [12]:
import great_expectations as ge

# Create an empty expectation suite with the Great Expectations library.
# Expectations are added to it in the next cell, and the suite is later
# attached to the feature group.
expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="validate_on_insert_suite"
)

In [13]:
# Let's add an expectation on the 'total_fare' column:
# every value must lie between min_value and max_value.
expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "total_fare",
            "min_value": 3, 
            "max_value": 5000
        }
    )
)

{"meta": {}, "kwargs": {"column": "total_fare", "min_value": 3, "max_value": 5000}, "expectation_type": "expect_column_values_to_be_between"}

In [14]:
# Alternative: let the Great Expectations profiler derive a suite from the data.
# NOTE(review): `expectation_suite_profiler` is not used by any later cell visible
# here; only the hand-written `expectation_suite` is attached to the feature group.

ge_profiler = ge.profile.BasicSuiteBuilderProfiler()
expectation_suite_profiler, _ = ge_profiler.profile(ge.from_pandas(df_fares)) # the DataFrame to profile, wrapped as a GE dataset

Profiling Columns:   0%|          | 0/6 [00:00<?, ?it/s, ride_id]

2022-08-16 23:16:56,905 INFO: 	45 expectation(s) included in expectation_suite.




In [15]:
# Let's attach the expectation suite to the feature group.
# This persists the suite to the Hopsworks backend.
fares_fg.save_expectation_suite(expectation_suite)

Attached expectation suite to featuregroup, edit it at https://c.app.hopsworks.ai:443/p/164/fs/106/fg/596


In [16]:
# Cast every column to int64 (presumably to match the feature group schema
# before inserting; confirm against the feature group definition).
df_fares = df_fares.astype("int64")

In [17]:
# Convert the monetary columns to double; the existing feature group data
# shows these columns as floating-point values (e.g. 6.0, 19.0).
for col in ["tip", "tolls", "total_fare"]:
    df_fares[col] = df_fares[col].astype("double")


In [18]:
# Insert the generated rows into the feature group. The logged output below
# shows a validation report for the attached expectation suite produced by this call.
fares_fg.insert(df_fares)

2022-08-16 23:16:57,457 INFO: 	1 expectation(s) included in expectation_suite.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/164/fs/106/fg/596


Uploading Dataframe: 0.00% |          | Rows 0/100 | Elapsed Time: 00:00 | Remaining Time: ?

Launching offline feature group backfill job...
Backfill Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/164/jobs/named/fares_fg_1_offline_fg_backfill/executions


(<hsfs.core.job.Job at 0x255c4015ee0>,
 {
   "success": true,
   "results": [
     {
       "success": true,
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       },
       "expectation_config": {
         "meta": {
           "expectationId": 181
         },
         "kwargs": {
           "column": "total_fare",
           "min_value": 3,
           "max_value": 5000
         },
         "expectation_type": "expect_column_values_to_be_between"
       },
       "meta": {},
       "result": {
         "element_count": 100,
         "missing_count": 0,
         "missing_percent": 0.0,
         "unexpected_count": 0,
         "unexpected_percent": 0.0,
         "unexpected_percent_total": 0.0,
         "unexpected_percent_nonmissing": 0.0,
         "partial_unexpected_list": []
       }
     }
   ],
   "statistics": {
     "evaluated_expectations": 1,
     "successful_expectations": 1,
     "unsucces