## <span style='color:#ff5f27'> 📝 Imports</span>

In [None]:
import pandas as pd
import numpy as np
import great_expectations as ge
from great_expectations.core import ExpectationSuite, ExpectationConfiguration

## <span style='color:#ff5f27'> 👥 Fetch Users Data</span>

In [None]:
# Load the users dataset from the remote parquet file, then preview the first rows.
users_parquet_url = 'https://repo.hops.works/dev/davit/tiktok_recsys/users.parquet'
data_users_df = pd.read_parquet(users_parquet_url)
data_users_df.head()

## <span style="color:#ff5f27">👮🏻‍♂️ Great Expectations </span>

In [None]:
# Create a Great Expectations DataFrame from the pandas DataFrame
ge_users_df = ge.from_pandas(data_users_df)

# Initialize the expectation suite and give it an explicit name
expectation_suite_users = ge_users_df.get_expectation_suite()
expectation_suite_users.expectation_suite_name = "user_data_suite"

# Expectation: Age should be between 12 and 100 (the bounds enforced below)
expectation_suite_users.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={"column": "age", "min_value": 12, "max_value": 100},
    )
)

# Expectations: no column of the users DataFrame may contain null values.
# One not-null expectation is registered per column.
for column in ge_users_df.columns:
    expectation_suite_users.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_not_be_null",
            kwargs={"column": column},
        )
    )

# Expectation: the distinct values of Gender must all come from the allowed set
expectation_suite_users.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_distinct_values_to_be_in_set",
        kwargs={"column": "gender", "value_set": ["Male", "Female", "Other"]},
    )
)

## <span style="color:#ff5f27">🔮 Connect to Hopsworks Feature Store </span>

In [None]:
import hopsworks

# Log in to Hopsworks and keep a handle on the project.
project = hopsworks.login()
# Feature store handle of that project; used below to create feature groups.
fs = project.get_feature_store()

## <span style="color:#ff5f27">🪄 Feature Group Creation </span>


In [None]:
# Statistics settings passed to the feature group: statistics, histograms
# and correlations are all switched on.
users_statistics_config = {
    "enabled": True,
    "histograms": True,
    "correlations": True,
}

# Fetch the "users" feature group (version 1), creating it if necessary.
# It is keyed by user_id, partitioned by registration_month, uses
# registration_date as event time, and carries the users expectation suite.
users_fg = fs.get_or_create_feature_group(
    name="users",
    version=1,
    description="Users data.",
    primary_key=["user_id"],
    partition_key=["registration_month"],
    event_time="registration_date",
    online_enabled=True,
    expectation_suite=expectation_suite_users,
    statistics_config=users_statistics_config,
)

# Insert the users DataFrame into the feature group.
users_fg.insert(data_users_df)
print('Done ✅')

In [None]:
# Human-readable description for every feature of the users feature group,
# kept as a list of {"name", "description"} records.
feature_descriptions = [
    {"name": feature_name, "description": feature_text}
    for feature_name, feature_text in (
        ("user_id", "Unique identifier for each user."),
        ("gender", "Gender of the user."),
        ("age", "Age of the user."),
        ("country", "Country of Residence of the user."),
        ("registration_date", "Date of registration."),
        ("registration_month", "Month of registration derived from registration_date."),
    )
]

# Attach each description to its feature on the feature group.
for feature in feature_descriptions:
    users_fg.update_feature_description(feature["name"], feature["description"])

---