Before running this notebook, make sure that the following packages are installed in your Python environment:

* `matplotlib`
* `seaborn`
* `networkx`
* `snowflake-snowpark-python`
* `snowflake-ml-python`
* `snowflake`

You also need to make sure the file `plotting.py` is in the same directory as this notebook.

In [1]:
# Import python packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from snowflake.ml.feature_store import (
    FeatureStore,
    CreationMode)

from snowflake.ml.modeling.impute import SimpleImputer
from snowflake.ml.modeling.preprocessing import OrdinalEncoder, OneHotEncoder
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.modeling.metrics import mean_absolute_percentage_error
from snowflake.ml.registry import Registry

from snowflake.snowpark import Session
import snowflake.snowpark.functions as snow_funcs

from snowflake.core import Root
from snowflake.core.warehouse import Warehouse

# Import a helper function from the local plotting.py file (must be in the same directory as this notebook)
from plotting import plot_lineage

# Get the Snowpark session
from snowflake.snowpark import Session

### Connect to Snowflake

This example uses the `connections.toml` file to connect to Snowflake. To learn how to set it up, see https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-connect#connecting-using-the-connections-toml-file.

In [2]:
# Names of the Snowflake objects used throughout this notebook
db_name = "SNOWPARK_DEMO_DB"
schema_name = "SIMPLE_ML_SCHEMA"      # schema made current for the session below
fs_schema_name = "SIMPLE_FS_SCHEMA"   # schema name passed to the Feature Store
mr_schema_name = "SIMPLE_MR_SCHEMA"   # schema name passed to the Model Registry
wh_name = "SIMPLE_ML_WH"

# Name of the connection in connections.toml to be used to connect to Snowflake
CONNECTION_NAME = '<YOUR CONNECTION NAME>'

# Build the Snowpark session from the named connection
session_builder = Session.builder.config("connection_name", CONNECTION_NAME)
session = session_builder.create()

# Make the demo schema and warehouse the session's current ones
current_schema = f'{db_name}.{schema_name}'
session.use_schema(current_schema)
session.use_warehouse(wh_name)

In [3]:
# Connect to the Feature Store.
# CreationMode.FAIL_IF_NOT_EXIST is requested, so (as the name suggests) this
# cell is expected to connect to an existing Feature Store, not create one.
feature_store_args = dict(
    session=session,
    database=db_name,
    name=fs_schema_name,
    default_warehouse=wh_name,
    creation_mode=CreationMode.FAIL_IF_NOT_EXIST,
)
fs = FeatureStore(**feature_store_args)

### Generate a Training Dataset

In [None]:
# Show the entities registered in the Feature Store
entities_df = fs.list_entities()
entities_df.show()

In [None]:
# Show only the feature views whose ENTITIES array contains 'CUSTOMER'
customer_entity = snow_funcs.to_variant(snow_funcs.lit('CUSTOMER'))
has_customer_entity = snow_funcs.array_contains(customer_entity, snow_funcs.col("ENTITIES"))
fs.list_feature_views().filter(has_customer_entity).show()

In [6]:
# Fetch version V1 of the two customer feature views used for training
feature_view_version = "V1"
cust_fv = fs.get_feature_view(
    name="CUSTOMER_GENERAL_DATA_FEATURES", version=feature_view_version
)
behavior_fv = fs.get_feature_view(
    name="CUSTOMER_BEHAVIOR_DATA_FEATURES", version=feature_view_version
)

Create a spine dataframe that contains the EMAILs of all customers we want to get features for; it also has the target column for the model training.

In [None]:
# Spine dataframe: the CUSTOMER_LIFE_TIME_VALUE table without its YEAR_MONTH column
print('Customer Life Time Value Data:')
ltv_table_name = f'{db_name}.{schema_name}.CUSTOMER_LIFE_TIME_VALUE'
ltv_df = session.table(ltv_table_name).drop('YEAR_MONTH')

# Preview the first five rows
ltv_df.show(n=5)

Generate a Dataset with features for all customers in ltv_df

In [None]:
# Register a Dataset built from the spine and both feature views;
# LIFE_TIME_VALUE from the spine is passed as the label column.
dataset_features = [cust_fv, behavior_fv]
registered_dataset = fs.generate_dataset(
    name="ECOMMERCE_CUSTOMER_FEATURES",
    spine_df=ltv_df,
    features=dataset_features,
    spine_label_cols=["LIFE_TIME_VALUE"],
    desc="Training Data to train model to predict Customer Life Time Value.",
)

# Read the registered Dataset back as a Snowpark DataFrame.
# The reader can also return a Pandas DataFrame (to_pandas), a TensorFlow
# tf.data.Dataset, a PyTorch datapipe or a PyTorch Iterable Dataset.
dataset_reader = registered_dataset.read
registered_dataset_df = dataset_reader.to_snowpark_dataframe()
registered_dataset_df.limit(5).show()

# Alternative, kept for reference only (not executed): build a training set
# directly from the Feature Store instead of registering a Dataset.
#training_set = fs.generate_training_set(
#    spine_df=ltv_df,
#    features=[cust_fv,behavior_fv],
#    spine_label_cols=["LIFE_TIME_VALUE"],      # optional
#)
#training_set.limit(5)

Create a pipeline that imputes, encodes and trains an XGBRegressor using GridSearchCV.

The GridSearchCV will run in a distributed mode within Snowflake; the larger the Warehouse used, the more distributed it will be.

In [9]:
# Split the data into train (90%) and test (10%) sets; the seed is fixed so
# the split is requested the same way on every run.
train_df, test_df = registered_dataset_df.random_split(weights=[0.9, 0.1], seed=0)

# Report the split sizes. As a bare expression in the middle of a cell the
# counts were computed but never displayed, so print them explicitly.
print(f"Train rows: {train_df.count()}, test rows: {test_df.count()}")

# Drop the Email column for Training (test_df keeps it)
train_df = train_df.drop('EMAIL')

# sklearn-like imputer: fill missing values of the numeric columns with the
# column mean, writing the result to new *_IMP columns (inputs are kept).
numeric_cols = ['AVG_SESSION_LENGTH_MIN', 'AVG_TIME_ON_APP_MIN', 'AVG_TIME_ON_WEBSITE_MIN']
si_numeric = SimpleImputer(
    input_cols=numeric_cols,
    output_cols=[f'{col}_IMP' for col in numeric_cols],
    strategy='mean',
    drop_input_cols=False,
)

# sklearn-like encoders (input columns are kept alongside the encoded ones)

# Ordinal encoding of MEMBERSHIP_STATUS with an explicit category order
membership_levels = ["BASIC", "BRONZE", "SILVER", "GOLD", "PLATIN", "DIAMOND"]
categories = {"MEMBERSHIP_STATUS": np.array(membership_levels)}
oe_categorical = OrdinalEncoder(
    input_cols=["MEMBERSHIP_STATUS"],
    output_cols=["MEMBERSHIP_STATUS_OE"],
    categories=categories,
    drop_input_cols=False,
)

# One-hot encoding of GENDER
ohe_categorical = OneHotEncoder(
    input_cols=["GENDER"],
    output_cols=["GENDER_OHE"],
    drop_input_cols=False,
)

# Columns used by the XGBoost model: inputs, label and prediction output
label_cols = ['LIFE_TIME_VALUE']
output_cols = ['LIFE_TIME_VALUE_PREDICTION']
feature_cols = [
    'GENDER_OHE_FEMALE',
    'GENDER_OHE_MALE',
    'MEMBERSHIP_STATUS_OE',
    'MEMBERSHIP_LENGTH_DAYS',
    'AVG_SESSION_LENGTH_MIN_IMP',
    'AVG_TIME_ON_APP_MIN_IMP',
    'AVG_TIME_ON_WEBSITE_MIN_IMP',
    'APP_PRIMARY',
]

# Hyperparameter grid: 4 x 3 = 12 combinations
hyperparameter_grid = {
    "n_estimators": [100, 200, 300, 400],
    "learning_rate": [0.1, 0.2, 0.3],
}

# Grid search over the XGBoost regressor, scored by negated MAPE
grid_search = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid=hyperparameter_grid,
    n_jobs=-1,
    scoring="neg_mean_absolute_percentage_error",
    input_cols=feature_cols,
    label_cols=label_cols,
    output_cols=output_cols,
)

# Assemble the pipeline: imputation, the two encoders, then the grid search
pipeline_steps = [
    ("SI_NUMERIC", si_numeric),
    ("OE_CATEGORICAL", oe_categorical),
    ("OHE_CATEGORICAL", ohe_categorical),
    ("GRIDSEARCH_XGBOOST", grid_search),
]
model_pipeline = Pipeline(steps=pipeline_steps)


Scale up the Warehouse using the Python API

In [10]:
# Scale up our WH using the Snowflake Python API
root = Root(session)

# Desired definition of the warehouse: same name, size XXLARGE
ml_wh = Warehouse(name=wh_name, warehouse_size="XXLARGE", wait_for_completion="true")

# Look up the warehouse resource and apply the definition to it
ml_wh_res = root.warehouses[wh_name]
ml_wh_res.create_or_alter(ml_wh)

Run the training.

In [None]:
# Fit the pipeline to the training data.
# model_pipeline holds the imputer, both encoders and the grid search as its
# steps; train_df is the training split with the EMAIL column dropped.
fitted_pipeline = model_pipeline.fit(train_df)

Scale the Warehouse back down, since the larger size is no longer needed.

In [12]:
# Desired definition of the warehouse: same name, size SMALL
ml_wh = Warehouse(name=wh_name, warehouse_size="SMALL", wait_for_completion="true")

# Look up the warehouse resource and apply the definition to it
ml_wh_res = root.warehouses[wh_name]
ml_wh_res.create_or_alter(ml_wh)

# Model Evaluation

In [None]:
# Analyze grid search results: pull cv_results_ from the fitted grid search step
gs_results = fitted_pipeline.to_sklearn().named_steps['GRIDSEARCH_XGBOOST'].cv_results_
tried_params = gs_results["params"]

# One row per tried parameter combination. The grid search was scored with the
# negated MAPE, so the sign of the mean test score is flipped back.
gs_results_df = pd.DataFrame(data={
    "n_estimators": [params["n_estimators"] for params in tried_params],
    "learning_rate": [params["learning_rate"] for params in tried_params],
    "mape": -gs_results["mean_test_score"],
})

# MAPE against learning rate, one line per n_estimators value
sns.set_context("notebook", font_scale=0.5)
sns.relplot(
    data=gs_results_df,
    x="learning_rate",
    y="mape",
    hue="n_estimators",
    kind="line",
    height=3,
)

plt.show()

Calculate MAPE (mean absolute percentage error)

In [None]:
# Column names of the actual and the predicted life time value
actual_col = "LIFE_TIME_VALUE"
predicted_col = "LIFE_TIME_VALUE_PREDICTION"

# Predict LTV for customers in test data and cache the result
predictions = fitted_pipeline.predict(test_df).cache_result()

# Mean absolute percentage error of the predictions
mape = mean_absolute_percentage_error(
    df=predictions,
    y_true_col_names=actual_col,
    y_pred_col_names=predicted_col,
)

# Scatter plot of actual vs predicted, with a red y = x reference line
plot_data = predictions[actual_col, predicted_col].to_pandas().astype("float64")
g = sns.relplot(
    data=plot_data,
    x=actual_col,
    y=predicted_col,
    kind="scatter",
    height=3,
)
g.ax.axline((0, 0), slope=1, color="r")

# Print the metric, show a sample of the predictions, then render the plot
print(f"Mean absolute percentage error: {mape}")
predictions.select(actual_col, predicted_col).limit(15).show()
plt.show()

# Log model to Model Registry

In [None]:
# Reference to the Model Registry in the registry schema of the demo database
ml_reg = Registry(
    session=session,
    database_name=db_name,
    schema_name=mr_schema_name,
)

# Show the models currently in the registry
ml_reg.show_models()

In [17]:

# Log the fitted pipeline as a new model version, with the test-set MAPE
# attached as a metric
model_metrics = {"mean_abs_pct_err": mape}
registered_model = ml_reg.log_model(
    fitted_pipeline,
    model_name="CUSTOMER_LTV_MODEL",
    version_name='MY_FIRST_MODEL_VERSION',
    comment="Model trained using GridsearchCV in Snowpark to predict Customer Life Time Value",
    metrics=model_metrics,
    conda_dependencies=['xgboost'],
    options={'relax_version': False},
)

In [None]:
# List the models in the registry again (this cell follows the log_model cell)
ml_reg.show_models()

In [None]:
# Query the upstream lineage of the logged model version
registered_model.lineage('upstream')

In [None]:

# Trace the upstream lineage of the logged model version (distance=3) and plot it
model_fqn = f"{db_name}.{mr_schema_name}.CUSTOMER_LTV_MODEL"
df = session.lineage.trace(
    model_fqn,
    "MODEL",
    object_version='MY_FIRST_MODEL_VERSION',
    direction='upstream',
    distance=3,
)
plot_lineage(df)