Before running this notebook, make sure that the following packages are installed in your Python environment:

* `snowflake`
* `snowflake-snowpark-python`
* `snowflake-ml-python`
* `matplotlib`

In [5]:
# Import python packages
import matplotlib.pyplot as plt

from snowflake.ml.feature_store import (
    FeatureStore,
    FeatureView,
    Entity,
    CreationMode)

from snowflake.snowpark import Session
import snowflake.snowpark.functions as snow_funcs

### Connect to Snowflake

This example uses the connections.toml file to connect to Snowflake. You can read more about how to set it up at https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-connect#connecting-using-the-connections-toml-file

In [None]:
# Name of the connection in connections.toml to be used to connect to Snowflake
CONNECTION_NAME = '<YOUR CONNECTION NAME>'

# Snowflake objects used throughout the notebook
db_name = "SNOWPARK_DEMO_DB"
schema_name = "SIMPLE_ML_SCHEMA"
fs_schema_name = "SIMPLE_FS_SCHEMA"
wh_name = "SIMPLE_ML_WH"

# Open the session from the named connection, then point it at the
# working schema and warehouse
session = (
    Session.builder
    .config("connection_name", CONNECTION_NAME)
    .create()
)
session.use_schema(f'{db_name}.{schema_name}')
session.use_warehouse(wh_name)

# Last expression of the cell: the notebook displays the current schema
session.get_fully_qualified_current_schema()

# Data Exploration

In [None]:
# Build Snowpark DataFrames on top of the two source tables
cust_df = session.table(f'{db_name}.{schema_name}.CUSTOMER_GENERAL_DATA')
behavior_df = session.table(f'{db_name}.{schema_name}.CUSTOMER_BEHAVIOR_DATA')

# Preview the first two rows of each table
# NOTE(review): an earlier note here mentioned a new membership-start
# column — confirm against the CUSTOMER_GENERAL_DATA schema
print('Customer General Data:')
cust_df.show(n=2)

print('Customer Behavior Data:')
behavior_df.show(n=2)

In [None]:
# Print summary statistics for both DataFrames
print('Statistical Data Analysis:')
for frame in (cust_df, behavior_df):
    frame.describe().show()

In [None]:
# Categorical columns to plot as bar charts of row counts per value
plot_cols = ['GENDER', 'MEMBERSHIP_STATUS']
for category_col in plot_cols:
    # Aggregate in Snowflake, then pull the small result into pandas to plot
    value_counts = cust_df.group_by(category_col).count().to_pandas()
    value_counts.plot.bar(x=category_col, y="COUNT")

# Feature Engineering

In [11]:
# Open the Feature Store that lives in schema `fs_schema_name` of `db_name`.
# FAIL_IF_NOT_EXIST is requested, so this presumably expects the Feature
# Store to have been created beforehand — confirm in the library docs.
fs = FeatureStore(
    session=session,
    database=db_name,
    name=fs_schema_name,
    creation_mode=CreationMode.FAIL_IF_NOT_EXIST,
    default_warehouse=wh_name,
)

Check if we have any Entities registered

In [None]:
# Print the entities listed by the Feature Store
fs.list_entities().show()

Check for feature views

In [None]:
# Print the feature views listed by the Feature Store
fs.list_feature_views().show()

In [None]:
# Feature definition: number of days since the customer joined,
# expressed as today's date minus MEMBER_JOIN_DATE
membership_length_days = snow_funcs.current_date() - snow_funcs.col('MEMBER_JOIN_DATE')
cust_feature_df = cust_df.with_column('MEMBERSHIP_LENGTH_DAYS', membership_length_days)

# Preview a few rows including the new column
cust_feature_df.limit(n=3).show()

Create a description for each of the features, which we are going to use when registering them

In [17]:
# Human-readable descriptions keyed by feature (column) name
feature_descriptions = {
    'MEMBERSHIP_LENGTH_DAYS': (
        'The number of days since the customer joined the membership program, '
        'calculated as the difference between the current date and the member join date.'
    ),
    'GENDER': 'The customers gender, either MALE or FEMALE.',
    'MEMBERSHIP_STATUS': (
        'The customers current membership status, which can be one of the '
        'following levels: BRONZE, BASIC, SILVER, GOLD, or PLATIN.'
    ),
}


In [None]:
# Define the CUSTOMER entity, joined on EMAIL, and register it
entity = Entity(
    name="CUSTOMER",
    join_keys=["EMAIL"],
    desc='Unique identifier for customers.',
)
fs.register_entity(entity)

# Print the entity list after registration
fs.list_entities().show()

We first create a FeatureView that will be automatically refreshed once a minute if there is new data, attach the feature descriptions, and then register it, which will kick off the materialization of the feature values.

Since we are using a non-deterministic function, CURRENT_DATE, the view cannot be incrementally updated but will do a full refresh every time we have new data.

In [None]:
# Columns exposed by the feature view: the join key plus the three features
cust_fv_columns = ['EMAIL', 'GENDER', 'MEMBERSHIP_STATUS', 'MEMBERSHIP_LENGTH_DAYS']

# Define the Feature View and attach the per-feature descriptions
cust_fv = FeatureView(
    name="CUSTOMER_GENERAL_DATA_FEATURES",
    entities=[entity],
    feature_df=cust_feature_df.select(cust_fv_columns),
    # refresh_freq can also be a cron schedule - * * * * * America/Los_Angeles
    refresh_freq="1 minute",
    desc="Ecommerce Customer General Features",
).attach_feature_desc(feature_descriptions)

# Register version V1 (overwrite=True and block=True are passed through as-is)
cust_fv = fs.register_feature_view(
    feature_view=cust_fv,
    version="V1",
    block=True,
    overwrite=True,
)

### Register Customer Behavior Features

In [None]:
# True when the customer spends more average minutes in the app than on the website
prefers_app = snow_funcs.col('AVG_TIME_ON_APP_MIN') > snow_funcs.col('AVG_TIME_ON_WEBSITE_MIN')

# Keep the join key and the raw behavior metrics, then add APP_PRIMARY
# as a 1/0 flag derived from the comparison above
behavior_features_df = (
    behavior_df
    .select('EMAIL', 'AVG_SESSION_LENGTH_MIN', 'AVG_TIME_ON_APP_MIN', 'AVG_TIME_ON_WEBSITE_MIN')
    .with_column('APP_PRIMARY', snow_funcs.iff(prefers_app, 1, 0))
)
behavior_features_df.limit(10).show()

In [None]:
# Count customers per APP_PRIMARY value in Snowflake, then plot the result
app_primary_counts = behavior_features_df.group_by('APP_PRIMARY').count().to_pandas()
app_primary_counts.plot.bar(x='APP_PRIMARY')

In [23]:
# Descriptions for the behavior features, keyed by column name
behavior_feature_descriptions = {
    "AVG_SESSION_LENGTH_MIN": "Average Session Length in Minutes.",
    "AVG_TIME_ON_APP_MIN": "Average Time a customer spends in the app per day.",
    "AVG_TIME_ON_WEBSITE_MIN": "Average Time a customers spend on the website per day.",
    "APP_PRIMARY": "If a customer uses the app more than the website",
}

# Define the Feature View and attach the per-feature descriptions
behavior_fv = FeatureView(
    name="CUSTOMER_BEHAVIOR_DATA_FEATURES",
    entities=[entity],
    feature_df=behavior_features_df,
    # refresh_freq can also be a cron schedule - * * * * * America/Los_Angeles
    refresh_freq="1 minute",
    desc="Ecommerce Customer Behavior Features",
).attach_feature_desc(behavior_feature_descriptions)

# Register version V1 (overwrite=True and block=True are passed through as-is)
behavior_fv = fs.register_feature_view(
    feature_view=behavior_fv,
    version="V1",
    block=True,
    overwrite=True,
)

In [None]:
# Print the entity list again, after the registrations earlier in the notebook
fs.list_entities().show()

In [None]:
# Print the feature view list again, after the registrations earlier in the notebook
fs.list_feature_views().show()

In [None]:
# Query the 'upstream' lineage of the registered behavior feature view.
# As the last expression of the cell, its result is displayed by the notebook.
behavior_fv.lineage('upstream')