## 1 - Setup Demo

In [None]:
# Helper functions for this demo
from helper_functions.setup_environment import setup_demo
from helper_functions.plotting import extract_python_code, plot_inshop_vs_online_revenue, visualize_lineage
from helper_functions.mlops import get_feature_df, train_new_model, simulate_model_performance


# Import python packages
import plotly.express as px
import streamlit as st
import pandas as pd
import json

# Import Snowflake packages
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark import functions as F
from snowflake.snowpark.functions import lit, col
from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.modeling.metrics import mean_absolute_percentage_error
from snowflake.ml.registry import Registry
from snowflake.ml.monitoring.entities.model_monitor_config import ModelMonitorSourceConfig, ModelMonitorConfig
from snowflake.ml.feature_store import (
    FeatureStore,
    FeatureView,
    Entity,
    CreationMode
)
from snowflake.cortex import Complete

# Create a session
# get_active_session() fetches the currently active Snowpark session rather than
# opening a new connection, so no credentials appear in this notebook.
session = get_active_session()
# Project helper from helper_functions.setup_environment; presumably provisions the
# demo objects used by the cells below — TODO confirm against the helper's source.
setup_demo(session)

## 2 - Data Exploration & Visualization

In [None]:
# Reference the demo transactions table as a Snowpark DataFrame
transactions_df = session.table('SIMPLE_MLOPS_DEMO.RETAIL_DATA.TRANSACTIONS')

# Row count, followed by a preview ordered so the most recent DATE values come first
print(f'Number of transactions: {transactions_df.count()}')
print('Transactions Data:')
transactions_df.order_by(col('DATE').desc()).show()

# Summary statistics from describe(), ordered by its SUMMARY label column
print('Quick Variable Analysis:')
transactions_df.describe().order_by('SUMMARY').show()

### Plotting Data

In [None]:
# Ask a Cortex LLM to write the aggregation and plotting code for the monthly revenue chart
model = 'mistral-large2'
# The prompt embeds the actual column names of transactions_df so the model can reference them
prompt = f"""
I have a Snowpark Dataframe called transactions_df with the following columns: {transactions_df.columns}
Write code using Snowpark Python to aggregate the data showing the total monthly revenue (TOTAL_REVENUE) from all channels and month (MONTH).
Afterwards use the data to create a plotly bar chart to show total revenue per month. For the x-axis use dtick="M1".
Make sure to use the container-width for the plotly chart.
Only return the code to transform the dataframe and plot the data using Plotly in Streamlit.
"""
try:
    result = Complete(model, prompt)
    # Project helper from helper_functions.plotting; presumably strips everything but the
    # Python code from the model's answer — TODO confirm against the helper's source.
    result = extract_python_code(result)
    # SECURITY NOTE(review): exec() runs model-generated code with this notebook's full
    # privileges (including the Snowpark session). Acceptable for a demo; do not reuse
    # this pattern where the prompt or the model output can be influenced by untrusted input.
    exec(result)
except Exception as e:
    # Best-effort: any failure (LLM call, extraction, generated code) is only displayed;
    # the following cell contains a hand-written backup of the same chart.
    st.error(e)

In [None]:
# BACKUP
# Hand-written version of the monthly revenue chart, used when the LLM-generated code fails.

# Truncate every transaction date to the first day of its month, then sum revenue per month
month_start = F.date_trunc("month", F.col("DATE"))
monthly_revenue_df = (
    transactions_df
    .with_column("MONTH", month_start)
    .group_by("MONTH")
    .agg(F.sum("TRANSACTION_AMOUNT").as_("TOTAL_REVENUE"))
    .to_pandas()
)

# Bar chart with one bar per month
axis_labels = {"MONTH": "Month", "TOTAL_REVENUE": "Total Revenue"}
fig = px.bar(
    monthly_revenue_df,
    x="MONTH",
    y="TOTAL_REVENUE",
    title="Total Revenue per Month",
    labels=axis_labels,
)

# One tick per month, labelled like "Jan 2023"
fig.update_xaxes(dtick="M1", tickformat="%b %Y")

st.plotly_chart(fig, use_container_width=True)

In [None]:
# Project plotting helper (helper_functions.plotting); per its name it contrasts in-shop
# and online revenue for the given transactions DataFrame.
plot_inshop_vs_online_revenue(transactions_df)

## 3 - Feature Store & Feature Engineering

### Setup the Feature Store

In [None]:
# Open the feature store named FEATURE_STORE in the session's current database, using the
# session's current warehouse as default. CREATE_IF_NOT_EXIST makes this cell safe to re-run.
fs = FeatureStore(
    session=session, 
    database=session.get_current_database(), 
    name='FEATURE_STORE', 
    default_warehouse=session.get_current_warehouse(),
    creation_mode=CreationMode.CREATE_IF_NOT_EXIST,
)

### Create a Feature Store Entity "CUSTOMER"

In [None]:
# Create a new entity for the Feature Store
# CUSTOMER_ID is the join key shared by the feature views that reference this entity below.
entity = Entity(name="CUSTOMER", join_keys=["CUSTOMER_ID"], desc='Unique identifier for customers.')
fs.register_entity(entity)
# List the registered entities to confirm the registration
fs.list_entities().show()

### Add Transaction Features about Customers

In [None]:
def col_formatter(input_col, agg, window):
    """Build the output column name for one time-series aggregation.

    Args:
        input_col: Name of the aggregated input column.
        agg: Aggregation label; every 'SUM' in it is rewritten to 'TOTAL'.
        window: Window specifier such as '-1MM'; '-' becomes 'past_' and
            'MM' becomes '_MONTHS'.

    Returns:
        '<agg>_<input_col>_<window>' with the substitutions above applied,
        e.g. 'TOTAL_REVENUE_IN_SHOP_past_1_MONTHS'.
    """
    agg_label = agg.replace('SUM', 'TOTAL')
    window_label = window.replace('-', 'past_').replace('MM', '_MONTHS')
    return '_'.join([agg_label, input_col, window_label])

def _channel_revenue_features(transactions, channel, revenue_col):
    """Build sliding-window revenue features for a single transaction channel.

    The in-shop and online feature frames were two copies of the same pipeline
    that differed only in the channel literal and the revenue column name;
    this helper keeps the two definitions in lock-step.

    Args:
        transactions: Snowpark DataFrame with CUSTOMER_ID, DATE,
            TRANSACTION_CHANNEL and TRANSACTION_AMOUNT columns.
        channel: Value of TRANSACTION_CHANNEL to keep (e.g. 'IN_SHOP').
        revenue_col: Name given to the daily revenue column that is aggregated
            (it determines the generated feature names via col_formatter).

    Returns:
        Snowpark DataFrame keyed by CUSTOMER_ID and DATE holding the SUM of
        `revenue_col` over the '-1MM', '-2MM' and '-3MM' windows.
    """
    # Daily revenue per customer for the requested channel
    daily_revenue = (
        transactions.filter(col('TRANSACTION_CHANNEL') == channel)
        .group_by(['CUSTOMER_ID','DATE'])
        .agg(F.sum('TRANSACTION_AMOUNT').as_(revenue_col))
    )
    return (
        daily_revenue.analytics.time_series_agg(
            aggs={revenue_col: ['SUM']},
            windows=['-1MM','-2MM','-3MM'],
            sliding_interval="1D",
            group_by=['CUSTOMER_ID'],
            time_col='DATE',
            col_formatter=col_formatter
        )
        # Keep only the keys and the generated window features
        .drop(['SLIDING_POINT', revenue_col])
    )


in_shop_transaction_features = _channel_revenue_features(transactions_df, 'IN_SHOP', 'REVENUE_IN_SHOP')

online_transaction_features = _channel_revenue_features(transactions_df, 'ONLINE', 'REVENUE_ONLINE')

In [None]:
# Spot-check the generated in-shop features for one customer (CUSTOMER_ID 1), newest dates first
in_shop_transaction_features.filter(col('CUSTOMER_ID') == 1).order_by(col('DATE').desc()).show()

In [None]:
# Use LLM to generate feature descriptions
model = 'mistral-large2'


def _parse_llm_json(llm_text):
    """Parse an LLM answer as JSON, tolerating an optional Markdown code fence.

    The prompt asks the model not to wrap its answer, but that is not guaranteed;
    an answer such as "```json\\n{...}\\n```" would make a plain json.loads() fail.

    Args:
        llm_text: Raw text returned by the model.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the (unwrapped) text is not valid JSON.
    """
    cleaned = llm_text.strip()
    if cleaned.startswith('```'):
        # Drop the opening fence line ("```" or "```json") ...
        _, _, cleaned = cleaned.partition('\n')
        # ... and everything from the closing fence onwards, if there is one
        cleaned = cleaned.rsplit('```', 1)[0]
    return json.loads(cleaned)


def _describe_feature_columns(feature_df):
    """Ask the LLM for a short business description of every feature column.

    Args:
        feature_df: Snowpark DataFrame whose columns other than CUSTOMER_ID and
            DATE are the features to describe.

    Returns:
        The parsed JSON answer; the prompt requests a mapping of column name to description.
    """
    feature_columns = feature_df.drop('CUSTOMER_ID','DATE').columns
    prompt = f'Return a JSON string with column names as keys and a short business description as values. The columns are: {feature_columns}. Do not wrap the json codes in JSON markers.'
    llm_response = Complete(model, prompt, stream=False)
    return _parse_llm_json(llm_response)


feature_descriptions_in_shop_transactions = _describe_feature_columns(in_shop_transaction_features)
feature_descriptions_online_transactions = _describe_feature_columns(online_transaction_features)

st.json(feature_descriptions_in_shop_transactions)
st.json(feature_descriptions_online_transactions)

In [None]:
# Create Feature View
# In-shop revenue features, keyed by the CUSTOMER entity and timestamped by DATE.
# refresh_freq="1 minute" asks for the view to be refreshed every minute.
in_shop_transaction_fv = FeatureView(
    name="IN_SHOP_REVENUE_FEATURES", 
    entities=[entity],
    timestamp_col='DATE',
    feature_df=in_shop_transaction_features, 
    refresh_freq="1 minute",
    refresh_mode='AUTO',
    desc="Features for in-shop transactions",
    overwrite=True
)

# Add descriptions for some features
# The descriptions are the LLM-generated mapping built in the previous cell.
in_shop_transaction_fv = in_shop_transaction_fv.attach_feature_desc(feature_descriptions_in_shop_transactions)

# Register as version V1; block=True and overwrite=True are passed so the call is
# synchronous and replaces an existing V1 — presumably to keep the cell re-runnable.
in_shop_transaction_fv = fs.register_feature_view(
    feature_view=in_shop_transaction_fv, 
    version="V1", 
    block=True,
    overwrite=True
)

# Create Feature View
# Same setup as above, for the online channel.
online_transaction_fv = FeatureView(
    name="ONLINE_REVENUE_FEATURES", 
    entities=[entity],
    timestamp_col='DATE',
    feature_df=online_transaction_features, 
    refresh_freq="1 minute",
    refresh_mode='AUTO',
    desc="Features for online transactions",
    overwrite=True
)

# Add descriptions for some features
online_transaction_fv = online_transaction_fv.attach_feature_desc(feature_descriptions_online_transactions)

online_transaction_fv = fs.register_feature_view(
    feature_view=online_transaction_fv, 
    version="V1", 
    block=True,
    overwrite=True
)

## 4 - Model Training

### Generate the Training Dataset with Features from Feature Store

In [None]:
# Target: total revenue per customer in April 2024, predicted from features up to end of March 2024
target_df = session.table('SIMPLE_MLOPS_DEMO.RETAIL_DATA.TRANSACTIONS')
target_df = (
    target_df.filter(col('DATE').between('2024-04-01','2024-04-30'))    # Generate Target Variable for April 2024
    .group_by('CUSTOMER_ID')
    .agg(F.sum('TRANSACTION_AMOUNT').as_('NEXT_MONTH_REVENUE'))
)

# Get list of all customers
customers_df = session.table('SIMPLE_MLOPS_DEMO.RETAIL_DATA.CUSTOMERS').select('CUSTOMER_ID').distinct()

# Create spine dataframe
# Customers without any April transaction only appear through the outer join and get revenue 0.
spine_df = target_df.join(customers_df, on=['CUSTOMER_ID'], how='outer')
spine_df = spine_df.fillna(0, subset='NEXT_MONTH_REVENUE')
# The cutoff date is added AFTER the join so that every customer has one. Adding it to
# target_df before the join left it NULL for customers without April transactions, which
# gives those spine rows no timestamp for the feature lookup.
spine_df = spine_df.with_column('FEATURE_CUTOFF_DATE', F.to_date(lit('2024-03-31')))   # Features until End of March 2024
spine_df.order_by('CUSTOMER_ID').show()

In [None]:
# Join the spine with both feature views into a named, versioned dataset.
# FEATURE_CUTOFF_DATE is the spine timestamp used for the feature lookup and
# NEXT_MONTH_REVENUE is declared as the label column.
train_dataset = fs.generate_dataset(
    name="SIMPLE_MLOPS_DEMO.FEATURE_STORE.NEXT_MONTH_REVENUE_DATASET",
    spine_df=spine_df,
    features=[in_shop_transaction_fv, online_transaction_fv],
    version="V1",
    spine_timestamp_col="FEATURE_CUTOFF_DATE",
    spine_label_cols=["NEXT_MONTH_REVENUE"],
    include_feature_view_timestamp_col=False,
    desc="Initial Training Dataset"
)

# Read the dataset back as a Snowpark DataFrame for training
df = train_dataset.read.to_snowpark_dataframe()
df.show()

### Train an XGBoost Model

In [None]:
# Split the data into train and test sets
# 90/10 split with a fixed seed so the split is repeatable
train_df, test_df = df.random_split(weights=[0.9, 0.1], seed=0)

print(f'Number of samples in train: {train_df.count()}')
print(f'Number of samples in test: {test_df.count()}')

# Every column except the key, the spine timestamp and the label is a model input
feature_columns = train_df.drop(['CUSTOMER_ID','FEATURE_CUTOFF_DATE','NEXT_MONTH_REVENUE']).columns

# Snowpark ML XGBoost regressor; predictions are written to NEXT_MONTH_REVENUE_PREDICTION
xgb_model = XGBRegressor(
    input_cols=feature_columns,
    label_cols=['NEXT_MONTH_REVENUE'],
    output_cols=['NEXT_MONTH_REVENUE_PREDICTION'],
    n_estimators=100,
    learning_rate=0.05,
    random_state=0
)

xgb_model = xgb_model.fit(train_df)

### Evaluate the XGBoost Model

In [None]:
# Score the held-out test split
predictions = xgb_model.predict(test_df)

# Mean absolute percentage error between actual and predicted revenue
mape = mean_absolute_percentage_error(
    df=predictions,
    y_true_col_names="NEXT_MONTH_REVENUE",
    y_pred_col_names="NEXT_MONTH_REVENUE_PREDICTION",
)
print(f"Mean absolute percentage error: {mape}")

# Two charts side by side: feature importance (left) and actual vs. predicted (right)
col1, col2 = st.columns(2)

with col1:
    # Pair every input column with the importance reported by the underlying XGBoost model
    importance_pairs = list(zip(feature_columns, xgb_model.to_xgboost().feature_importances_))
    plot_data = pd.DataFrame(importance_pairs, columns=['FEATURE','IMPORTANCE'])
    top_features = plot_data.sort_values('IMPORTANCE', ascending=False).head(10)

    fig = px.bar(
        top_features,
        x="IMPORTANCE",
        y="FEATURE",
        title="Feature Importance",
        labels={"FEATURE": "Feature", "IMPORTANCE": "Importance"},
        orientation="h",
    )
    st.plotly_chart(fig, use_container_width=True)

with col2:
    # Pull only the two columns needed for the scatter plot and make them plain floats
    actual_vs_predicted = (
        predictions.select(["NEXT_MONTH_REVENUE", "NEXT_MONTH_REVENUE_PREDICTION"])
        .to_pandas()
        .astype("float64")
    )
    scatter_labels = {
        "NEXT_MONTH_REVENUE": "Actual Revenue",
        "NEXT_MONTH_REVENUE_PREDICTION": "Predicted Revenue",
    }
    fig = px.scatter(
        actual_vs_predicted,
        x="NEXT_MONTH_REVENUE",
        y="NEXT_MONTH_REVENUE_PREDICTION",
        title="Actual vs Predicted Revenue",
        labels=scatter_labels,
        trendline="ols",
        trendline_color_override="red",
    )
    st.plotly_chart(fig, use_container_width=True)

In [None]:
# Save baseline predictions
# Cast the timestamp and both revenue columns to fixed types, then persist the
# test-set predictions as the baseline table referenced by the model monitor.
predictions = (
    predictions
    .with_column('FEATURE_CUTOFF_DATE', F.col('FEATURE_CUTOFF_DATE').cast('timestamp'))
    .with_column('NEXT_MONTH_REVENUE_PREDICTION', F.col('NEXT_MONTH_REVENUE_PREDICTION').cast('number(38,2)'))
    .with_column('NEXT_MONTH_REVENUE', F.col('NEXT_MONTH_REVENUE').cast('number(38,2)'))
)
predictions.write.save_as_table(
    'SIMPLE_MLOPS_DEMO.MODEL_REGISTRY.MM_REVENUE_BASELINE_V1',
    mode='overwrite',
)

## 5 - Snowflake Model Registry
### Setup Model Registry

In [None]:
# Create reference to model registry
# Uses schema MODEL_REGISTRY in the session's current database; the
# enable_monitoring option is switched on because a model monitor is created later.
reg = Registry(
    session=session, 
    database_name=session.get_current_database(), 
    schema_name='MODEL_REGISTRY', 
    options={'enable_monitoring':True},
)

### Register Model in Model Registry

In [None]:
# Log the fitted model as version V1 together with its evaluation metrics.
# The feature importances are converted with astype('float') before being stored —
# presumably so the values serialize as plain floats; TODO confirm.
# sample_input_data passes ten training rows restricted to the feature columns.
registered_model = reg.log_model(
    xgb_model,
    model_name="CUSTOMER_REVENUE_MODEL",
    version_name='V1',
    metrics={
        'MAPE':mape, 
        'FEATURE_IMPORTANCE':dict(zip(feature_columns, xgb_model.to_xgboost().feature_importances_.astype('float'))),
        "TRAINING_DATA":{'FEATURE_CUTOFF_DATE':'2024-03-31'}
    },
    comment="Model trained using XGBoost to predict revenue per customer for next month.",
    conda_dependencies=['xgboost'],
    sample_input_data=train_df.select(feature_columns).limit(10),
    options={"relax_version": False, "enable_explainability": True}
)

In [None]:
# Set this model version as PRODUCTION
# The alias lets later code address this version as 'PRODUCTION' instead of 'V1'.
registered_model.set_alias('PRODUCTION')

In [None]:
# Run the registered model's "explain" function on the test split
explanations = registered_model.run(test_df, function_name="explain")
# Normalise the returned column names (strip the '"""' quoting, upper-case) and keep only
# the explanation columns. The comprehension variable is called `name` so it cannot be
# confused with the imported Snowpark `col` function.
explanations = explanations.rename({name: name.replace('"""', '').upper() for name in explanations.columns})
explanations = explanations.select([name for name in explanations.columns if '_EXPLANATION' in name])
explanations = explanations.to_pandas()

import shap
# Wrap the values in a SHAP Explanation object so the SHAP plotting helpers accept them.
# shap.Explanation is the public name of the class previously reached via the private
# shap._explanation module.
shap_exp = shap.Explanation(explanations.values, feature_names=explanations.columns)
shap.plots.bar(shap_exp)

In [None]:
# Trace lineage of model version V1 in both directions, up to two hops away
trace = session.lineage.trace(
    object_name='SIMPLE_MLOPS_DEMO.MODEL_REGISTRY.CUSTOMER_REVENUE_MODEL',
    object_version='V1',
    object_domain='model',
    direction='both',
    distance=2
)
trace.show()

In [None]:
# Render the lineage trace with the project plotting helper (helper_functions.plotting)
visualize_lineage(trace.to_pandas(), short_names=True)

### Continuous Model Monitoring

In [None]:
# Features as of end of April 2024 (project helper from helper_functions.mlops)
feature_df = get_feature_df(session, feature_cutoff_date='2024-04-30')
feature_df.show()

# Predict May values
# Score with the registered model, cast the timestamp and prediction columns to fixed
# types, and store the result as the model monitor's source table.
predictions = (
    registered_model.run(feature_df, function_name='PREDICT')
    .with_column('FEATURE_CUTOFF_DATE', F.col('FEATURE_CUTOFF_DATE').cast('timestamp'))
    .with_column('NEXT_MONTH_REVENUE_PREDICTION', F.col('NEXT_MONTH_REVENUE_PREDICTION').cast('number(38,2)'))
)
predictions.write.save_as_table(
    table_name='SIMPLE_MLOPS_DEMO.MODEL_REGISTRY.MM_TRANS_SOURCE_V1',
    mode='overwrite',
)

### Create a Model Monitor

In [None]:
# Python-API equivalent of the SQL model monitor created in the following cell; kept disabled.
# Enable once 1.7.3 with bugfix is available
# NOTE(review): the object names below were aligned with the tables this notebook actually
# writes (SIMPLE_MLOPS_DEMO database, *_V1 suffixes); the warehouse name is still
# hard-coded — confirm both before enabling.
# source_config = ModelMonitorSourceConfig(
#     source='SIMPLE_MLOPS_DEMO.MODEL_REGISTRY.MM_TRANS_SOURCE_V1',
#     timestamp_column='FEATURE_CUTOFF_DATE',
#     id_columns=['CUSTOMER_ID'],
#     prediction_score_columns=['NEXT_MONTH_REVENUE_PREDICTION'],
#     actual_score_columns=['NEXT_MONTH_REVENUE'],
#     baseline='SIMPLE_MLOPS_DEMO.MODEL_REGISTRY.MM_REVENUE_BASELINE_V1'
# )

# monitor_config = ModelMonitorConfig(
#     model_version=reg.get_model('CUSTOMER_REVENUE_MODEL').version('PRODUCTION'),
#     model_function_name='predict',
#     background_compute_warehouse_name='COMPUTE_WH',
#     refresh_interval='1 minute',
#     aggregation_window='1 day'
# )

# reg.add_monitor(
#     name='SIMPLE_MLOPS_DEMO.MODEL_REGISTRY.MM_V1',
#     source_config=source_config,
#     model_monitor_config=monitor_config
# )

In [None]:
-- Model monitor for CUSTOMER_REVENUE_MODEL V1: compares the predictions/actuals in the
-- source table against the baseline table, aggregated per day and refreshed every minute.
-- The clauses are whitespace-separated; the stray comma after BASELINE was removed.
CREATE OR REPLACE MODEL MONITOR SIMPLE_MLOPS_DEMO.MODEL_REGISTRY.MM_V1 WITH
    MODEL=SIMPLE_MLOPS_DEMO.MODEL_REGISTRY.CUSTOMER_REVENUE_MODEL VERSION=V1 FUNCTION=PREDICT
    SOURCE=SIMPLE_MLOPS_DEMO.MODEL_REGISTRY.MM_TRANS_SOURCE_V1
    BASELINE=SIMPLE_MLOPS_DEMO.MODEL_REGISTRY.MM_REVENUE_BASELINE_V1
    TIMESTAMP_COLUMN='FEATURE_CUTOFF_DATE'
    ID_COLUMNS=('CUSTOMER_ID')
    PREDICTION_SCORE_COLUMNS=('NEXT_MONTH_REVENUE_PREDICTION')
    ACTUAL_SCORE_COLUMNS=('NEXT_MONTH_REVENUE')
    WAREHOUSE=COMPUTE_WH
    REFRESH_INTERVAL='1 minute'
    AGGREGATION_WINDOW='1 day'

In [None]:
# Add new transactions
# NOTE: this append is not idempotent — re-running the cell adds the May 2024 rows again.
new_transactions = session.table('SIMPLE_MLOPS_DEMO._DATA_GENERATION._TRANSACTIONS').filter(col('DATE').between('2024-05-01','2024-05-31'))
new_transactions.write.save_as_table(table_name='SIMPLE_MLOPS_DEMO.RETAIL_DATA.TRANSACTIONS', mode='append')

# Calculate actual values
# Total May 2024 revenue per customer
actual_values_df = (
    session.table('SIMPLE_MLOPS_DEMO.RETAIL_DATA.TRANSACTIONS')
    .filter(col('DATE').between('2024-05-01','2024-05-31'))
    .group_by(['CUSTOMER_ID'])
    .agg(F.sum('TRANSACTION_AMOUNT').as_('TOTAL_REVENUE'))
)

# Get list of all customers
customers_df = session.table('SIMPLE_MLOPS_DEMO.RETAIL_DATA.CUSTOMERS').select('CUSTOMER_ID').distinct()

# Assume 0 revenue for customers without transactions
actual_values_df = actual_values_df.join(customers_df, on=['CUSTOMER_ID'], how='outer')
actual_values_df = actual_values_df.fillna(0,subset='TOTAL_REVENUE')
# The matching date (the feature cutoff of the May predictions) is added AFTER the join.
# Adding it before the join left DATE NULL for customers without May transactions, so
# the update condition below never matched them and their actuals were never written.
actual_values_df = actual_values_df.with_column('DATE', F.to_date(lit('2024-04-30')))

# Update source table from model monitor
# Rows are matched on cutoff date and customer; only NEXT_MONTH_REVENUE is overwritten.
source_table = session.table('SIMPLE_MLOPS_DEMO.MODEL_REGISTRY.MM_TRANS_SOURCE_V1')
source_table.update(
    condition=(
        (source_table['FEATURE_CUTOFF_DATE'] == actual_values_df['DATE']) &
        (source_table['CUSTOMER_ID'] == actual_values_df['CUSTOMER_ID'])
    ),
    assignments={
        "NEXT_MONTH_REVENUE": actual_values_df['TOTAL_REVENUE'],
    },
    source=actual_values_df
)

## Simulate the rest of the year

In [None]:
# Simulate model V1 from June 2024 through January 2025 with the project helper
# (helper_functions.mlops). generate_data=True presumably also adds new transactions
# for that period — TODO confirm against the helper's source.
start_date = '2024-06-01'
end_date = '2025-01-31'
model_version = 'V1'

simulate_model_performance(session, start_date, end_date, model_version, generate_data=True)

## Explore the Model Monitor
Navigate to the Model Monitor and observe the `MAPE` and `Difference of means`  for the last months.  

You will notice the following:
* Declining Model Performance
    * :arrow_up_small: MAPE (Mean Absolute Percentage Error)
* Feature Drift
    * :arrow_down_small: Difference of means for TOTAL_REVENUE_IN_SHOP_PAST_1_MONTHS (less in shop transaction volume)
    * :arrow_up_small: Difference of means for TOTAL_REVENUE_ONLINE_PAST_1_MONTHS (more online transaction volume)

If we visualize the monthly revenue distribution, we can see that online revenue grew while in-shop revenue declined.

In [None]:
# Re-plot in-shop vs. online revenue using the transactions_df defined in the data
# exploration cell. NOTE(review): assumes that DataFrame re-reads the table on each action
# and therefore includes the rows appended since — confirm.
plot_inshop_vs_online_revenue(transactions_df)

## Train a new version
Given that the user behavior changed, we'll train a new version of our model with fresh data.

In [None]:
# Train model version V2 on fresher data: features up to end of August 2024,
# target window September 2024 (project helper from helper_functions.mlops).
feature_cutoff_date = '2024-08-31'
target_start_date = '2024-09-01'
target_end_date = '2024-09-30'
model_version = 'V2'

train_new_model(session, feature_cutoff_date, target_start_date, target_end_date, model_version)

In [None]:
# Compare model versions V1 and V2.
# NOTE(review): compare_two_models is not among the names imported in the setup cell
# (helper_functions.mlops only contributes get_feature_df, train_new_model and
# simulate_model_performance), so on a fresh kernel this call raises NameError unless the
# name is defined elsewhere — confirm which helper module exports it and add the import.
compare_two_models(session,'V1','V2')

In [None]:
# Simulate model V2 from October 2024 through January 2025. generate_data=False is passed
# here, presumably because the V1 simulation above already covered this period with
# generate_data=True — TODO confirm against the helper's source.
start_date = '2024-10-01'
end_date = '2025-01-31'
model_version = 'V2'

simulate_model_performance(session, start_date, end_date, model_version, generate_data=False)