Before running this notebook, make sure the following packages are in the **Packages** list:

* `matplotlib`
* `snowflake-ml-python`

In [70]:
# Import python packages
import streamlit as st

from snowflake.ml.feature_store import (
    FeatureStore,
    FeatureView,
    Entity,
    CreationMode)

# from snowflake.snowpark import Session
import snowflake.snowpark.functions as snow_funcs

# Get the Snowpark session
# Reuses the currently active session via get_active_session(); no connection
# parameters or credentials are configured in this notebook.
from snowflake.snowpark.context import get_active_session
session = get_active_session()


In [71]:
# When running outside an environment that provides an active session, create one
# explicitly instead (requires `from snowflake.snowpark import Session`), e.g.:
# session = Session.builder.config("connection_name", "<your_connection_name>").create()

# Show the role, schema and warehouse the session is currently using.
# An f-string is used instead of `+` concatenation so the cell does not raise
# TypeError when one of the getters returns None (e.g. no warehouse selected).
print(
    f"Current role: {session.get_current_role()}, "
    f"Current schema: {session.get_fully_qualified_current_schema()}, "
    f"Current WH: {session.get_current_warehouse()}"
)

Current role: "SYSADMIN", Current schema: "DEMO_DB"."PUBLIC", Current WH: "COMPUTE_WH"


In [72]:
# Names of the Snowflake objects used throughout the notebook:
#   db_name        - database of the source tables and of the Feature Store
#   schema_name    - schema of the CUSTOMER_* source tables
#   fs_schema_name - name passed to FeatureStore(name=...)
#   wh_name        - warehouse used by the session and as the Feature Store default
db_name = "SNOWPARK_DEMO_DB"
schema_name = "SIMPLE_ML_SCHEMA"
fs_schema_name = "SIMPLE_FS_SCHEMA"
wh_name = "SIMPLE_ML_WH"

# Point the session at the working schema and warehouse, then display the
# resulting fully qualified schema as the cell output to confirm the switch.
session.use_schema(f'{db_name}.{schema_name}')
session.use_warehouse(wh_name)
session.get_fully_qualified_current_schema()

'"SNOWPARK_DEMO_DB"."SIMPLE_ML_SCHEMA"'

# Data Exploration

In [73]:
# Reference the two source tables as Snowpark DataFrames.
# Note: the general-data table also carries the membership start date
# (shown as MEMBER_JOIN_DATE in the preview).
cust_df = session.table(f'{db_name}.{schema_name}.CUSTOMER_GENERAL_DATA')
behavior_df = session.table(f'{db_name}.{schema_name}.CUSTOMER_BEHAVIOR_DATA')

# Print a heading followed by a two-row preview for each table
for heading, preview_df in (
    ('Customer General Data:', cust_df),
    ('Customer Behavior Data:', behavior_df),
):
    print(heading)
    preview_df.show(n=2)

Customer General Data:
------------------------------------------------------------------------------
|"EMAIL"               |"GENDER"  |"MEMBERSHIP_STATUS"  |"MEMBER_JOIN_DATE"  |
------------------------------------------------------------------------------
|ISEglO3tBE@jRkXJ.com  |MALE      |BASIC                |2020-03-21          |
|blarIyWxnz@EyabS.com  |MALE      |BRONZE               |2024-02-03          |
------------------------------------------------------------------------------

Customer Behavior Data:
-------------------------------------------------------------------------------------------------------
|"EMAIL"               |"AVG_SESSION_LENGTH_MIN"  |"AVG_TIME_ON_APP_MIN"  |"AVG_TIME_ON_WEBSITE_MIN"  |
-------------------------------------------------------------------------------------------------------
|ISEglO3tBE@jRkXJ.com  |3.2003                    |1.2003                 |6.2003                     |
|blarIyWxnz@EyabS.com  |5.0769                    |NULL       

In [74]:
# Summary statistics (count, mean, stddev, min, max) for both source tables
print('Statistical Data Analysis:')
for source_df in (cust_df, behavior_df):
    source_df.describe().show()

Statistical Data Analysis:
---------------------------------------------------------------------
|"SUMMARY"  |"EMAIL"               |"GENDER"  |"MEMBERSHIP_STATUS"  |
---------------------------------------------------------------------
|count      |200000                |200000    |200000               |
|mean       |NULL                  |NULL      |NULL                 |
|stddev     |NULL                  |NULL      |NULL                 |
|min        |000P87Apg4@wNfMv.com  |FEMALE    |BASIC                |
|max        |zzzzJ0vK6u@idfFj.com  |MALE      |SILVER               |
---------------------------------------------------------------------

-------------------------------------------------------------------------------------------------------------------
|"SUMMARY"  |"EMAIL"               |"AVG_SESSION_LENGTH_MIN"  |"AVG_TIME_ON_APP_MIN"  |"AVG_TIME_ON_WEBSITE_MIN"  |
--------------------------------------------------------------------------------------------------------------

In [None]:
# Draw one bar chart of row counts per categorical column, laid out side by side
plot_cols = ['GENDER', 'MEMBERSHIP_STATUS']
ncol = 2
cols = st.columns(ncol)

for i, col in enumerate(plot_cols):
    # The modulo wraps charts back to the first layout column when there are
    # more columns to plot than layout columns
    with cols[i % ncol]:
        st.bar_chart(cust_df.group_by(col).count(), x=col)

# Feature Engineering

In [75]:
# Connect to Feature Store
# The store is addressed by database `db_name` and name `fs_schema_name`, and
# uses `wh_name` as its default warehouse.
# NOTE(review): CreationMode.FAIL_IF_NOT_EXIST presumably makes this raise rather
# than create the feature store when it is missing - confirm in the
# snowflake-ml-python documentation.
fs = FeatureStore(
    session=session, 
    database=db_name, 
    name=fs_schema_name, 
    default_warehouse=wh_name,
    creation_mode=CreationMode.FAIL_IF_NOT_EXIST,
)

In [76]:
# Show the entities registered in the Feature Store before this notebook adds its own
fs.list_entities()

------------------------------------------------------------------------
|"NAME"    |"JOIN_KEYS"  |"DESC"                            |"OWNER"   |
------------------------------------------------------------------------
|CUSTOMER  |["EMAIL"]    |Unique identifier for customers.  |SYSADMIN  |
------------------------------------------------------------------------



In [77]:
# Show the feature views registered in the Feature Store before this notebook adds its own
fs.list_feature_views()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAME"                           |"VERSION"  |"DATABASE_NAME"   |"SCHEMA_NAME"     |"CREATED_ON"                |"OWNER"   |"DESC"                                |"ENTITIES"    |"REFRESH_FREQ"  |"REFRESH_MODE"  |"SCHEDULING_STATE"  |"WAREHOUSE"   |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|CUSTOMER_BEHAVIOR_DATA_FEATURES  |V1         |SNOWPARK_DEMO_DB  |SIMPLE_FS_SCHEMA  |2024-09-12 02:23:53.571000  |SYSADMIN  |Ecommerce Customer Behavior Features  |[             |1 minute        |INCREMENTAL     |ACTIVE              |SIMPLE_ML_WH  |


In [78]:
# Derive MEMBERSHIP_LENGTH_DAYS: the current date minus the customer's join date
member_join_date = snow_funcs.col('MEMBER_JOIN_DATE')
membership_length_days = snow_funcs.current_date() - member_join_date

cust_feature_df = cust_df.with_column('MEMBERSHIP_LENGTH_DAYS', membership_length_days)

# Preview the first three rows including the new column
cust_feature_df.limit(n=3)

---------------------------------------------------------------------------------------------------------
|"EMAIL"               |"GENDER"  |"MEMBERSHIP_STATUS"  |"MEMBER_JOIN_DATE"  |"MEMBERSHIP_LENGTH_DAYS"  |
---------------------------------------------------------------------------------------------------------
|ISEglO3tBE@jRkXJ.com  |MALE      |BASIC                |2020-03-21          |1636                      |
|blarIyWxnz@EyabS.com  |MALE      |BRONZE               |2024-02-03          |222                       |
|1JMXH8GV0T@4GzHW.com  |MALE      |PLATIN               |2023-03-01          |561                       |
---------------------------------------------------------------------------------------------------------



In [79]:
# Human-readable descriptions, keyed by feature column name, that are attached
# to the customer feature view when it is created.
feature_descriptions = dict(
    MEMBERSHIP_LENGTH_DAYS='The number of days since the customer joined the membership program, calculated as the difference between the current date and the member join date.',
    GENDER='The customers gender, either MALE or FEMALE.',
    MEMBERSHIP_STATUS='The customers current membership status, which can be one of the following levels: BRONZE, BASIC, SILVER, GOLD, or PLATIN.',
)


In [80]:
# Create a new entity for the Feature Store
# The CUSTOMER entity uses EMAIL as its join key; `entity` is reused below when
# the feature views are defined.
entity = Entity(name="CUSTOMER", join_keys=["EMAIL"], desc='Unique identifier for customers.')
fs.register_entity(entity)
# Display the registered entities to confirm CUSTOMER is present
fs.list_entities()

  return f(self, *args, **kargs)


------------------------------------------------------------------------
|"NAME"    |"JOIN_KEYS"  |"DESC"                            |"OWNER"   |
------------------------------------------------------------------------
|CUSTOMER  |["EMAIL"]    |Unique identifier for customers.  |SYSADMIN  |
------------------------------------------------------------------------



In [81]:
# Define the customer feature view: the EMAIL join key plus three features,
# with the per-feature descriptions attached.
cust_feature_cols = ['EMAIL', 'GENDER', 'MEMBERSHIP_STATUS', 'MEMBERSHIP_LENGTH_DAYS']

cust_fv = FeatureView(
    name="CUSTOMER_GENERAL_DATA_FEATURES",
    entities=[entity],
    feature_df=cust_feature_df.select(cust_feature_cols),
    refresh_freq="1 minute",  # can also be a cron schedule - * * * * * America/Los_Angeles
    desc="Ecommerce Customer General Features",
).attach_feature_desc(feature_descriptions)

# Register the view as version V1; from here on `cust_fv` refers to the
# registered feature view returned by the Feature Store.
cust_fv = fs.register_feature_view(
    feature_view=cust_fv,
    version="V1",
    block=True,
    overwrite=True,
)

  self._check_dynamic_table_refresh_mode(feature_view_name)


### Register Customer Behavior Features

In [82]:
# APP_PRIMARY flags customers who spend more average minutes in the app than on
# the website: 1 when app time is strictly greater, 0 otherwise (rows with a
# NULL app time or equal times show 0 in the preview).
app_minutes = snow_funcs.col('AVG_TIME_ON_APP_MIN')
website_minutes = snow_funcs.col('AVG_TIME_ON_WEBSITE_MIN')
prefers_app = snow_funcs.iff(app_minutes > website_minutes, 1, 0)

behavior_features_df = (
    behavior_df
    .select('EMAIL', 'AVG_SESSION_LENGTH_MIN', 'AVG_TIME_ON_APP_MIN', 'AVG_TIME_ON_WEBSITE_MIN')
    .with_column('APP_PRIMARY', prefers_app)
)

# Preview the first ten rows including the new flag
behavior_features_df.limit(10)

-----------------------------------------------------------------------------------------------------------------------
|"EMAIL"               |"AVG_SESSION_LENGTH_MIN"  |"AVG_TIME_ON_APP_MIN"  |"AVG_TIME_ON_WEBSITE_MIN"  |"APP_PRIMARY"  |
-----------------------------------------------------------------------------------------------------------------------
|ISEglO3tBE@jRkXJ.com  |3.2003                    |1.2003                 |6.2003                     |0              |
|blarIyWxnz@EyabS.com  |5.0769                    |NULL                   |5.0769                     |0              |
|1JMXH8GV0T@4GzHW.com  |6.2404                    |7.2404                 |10.2404                    |0              |
|gWvxW1OROO@sgaaN.com  |9.3691                    |NULL                   |11.3691                    |0              |
|mgCmEen8Fk@1PxIr.com  |4.8437                    |10.8437                |10.8437                    |0              |
|9AXLbqyQMx@Wf3g9.com  |7.4813          

In [83]:
# Bar chart of the number of customers for each APP_PRIMARY value (0 or 1)
st.bar_chart(behavior_features_df.group_by('APP_PRIMARY').count(), x='APP_PRIMARY')

---------------------------
|"APP_PRIMARY"  |"COUNT"  |
---------------------------
|0              |162171   |
|1              |37829    |
---------------------------



In [84]:
# Descriptions for the behavior features, keyed by column name
behavior_feature_descriptions = {
    "AVG_SESSION_LENGTH_MIN": "Average Session Length in Minutes.",
    "AVG_TIME_ON_APP_MIN": "Average Time a customer spends in the app per day.",
    "AVG_TIME_ON_WEBSITE_MIN": "Average Time a customers spend on the website per day.",
    "APP_PRIMARY": "If a customer uses the app more than the website",
}

# Define the behavior feature view on the CUSTOMER entity and attach the descriptions
behavior_fv = FeatureView(
    name="CUSTOMER_BEHAVIOR_DATA_FEATURES",
    entities=[entity],
    feature_df=behavior_features_df,
    refresh_freq="1 minute",  # can also be a cron schedule - * * * * * America/Los_Angeles
    desc="Ecommerce Customer Behavior Features",
).attach_feature_desc(behavior_feature_descriptions)

# Register the view as version V1; `behavior_fv` now refers to the registered
# feature view returned by the Feature Store.
behavior_fv = fs.register_feature_view(
    feature_view=behavior_fv,
    version="V1",
    block=True,
    overwrite=True,
)

In [85]:
# Re-list the entities after the feature views have been registered
fs.list_entities()

------------------------------------------------------------------------
|"NAME"    |"JOIN_KEYS"  |"DESC"                            |"OWNER"   |
------------------------------------------------------------------------
|CUSTOMER  |["EMAIL"]    |Unique identifier for customers.  |SYSADMIN  |
------------------------------------------------------------------------



In [87]:
# Re-list the feature views to confirm the registrations made above
fs.list_feature_views()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAME"                           |"VERSION"  |"DATABASE_NAME"   |"SCHEMA_NAME"     |"CREATED_ON"                |"OWNER"   |"DESC"                                |"ENTITIES"    |"REFRESH_FREQ"  |"REFRESH_MODE"  |"SCHEDULING_STATE"  |"WAREHOUSE"   |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|CUSTOMER_BEHAVIOR_DATA_FEATURES  |V1         |SNOWPARK_DEMO_DB  |SIMPLE_FS_SCHEMA  |2024-09-12 02:29:00.600000  |SYSADMIN  |Ecommerce Customer Behavior Features  |[             |1 minute        |INCREMENTAL     |ACTIVE              |SIMPLE_ML_WH  |


In [88]:
# Show the upstream lineage of the registered behavior feature view
# (the source objects it is derived from)
behavior_fv.lineage('upstream')

[LineageNode(
   name='SNOWPARK_DEMO_DB.SIMPLE_ML_SCHEMA.CUSTOMER_BEHAVIOR_DATA',
   version='None',
   domain='table',
   status='ACTIVE',
   created_on='2024-09-11 14:51:18'
 )]