# Examples of logging non-Snowpark ML models into the Snowpark Model Registry

In [None]:
# Import needed Snowflake modules
from snowflake.snowpark import Session
from snowflake.ml.registry import Registry

# Common modules for all examples
import pandas as pd
import numpy as np

### Connect to Snowflake

This example uses the connections.toml file to connect to Snowflake. For instructions on how to set it up, see https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-connect#connecting-using-the-connections-toml-file

In [None]:
# Connection and location settings used to open the Snowflake session below
CONNECTION_NAME = 'MY SNOWFLAKE CONNECTION' # Name of the connection in connections.toml to be used to connect to Snowflake
DATABASE_NAME = 'SNOWPARK_DEMO_DB' # Database to use for data
DATABASE_SCHEMA = 'SOURCE_DATA' # Name of the schema to store data in and where the source data is
FULLY_QUALIFIED_NAME = f"{DATABASE_NAME}.{DATABASE_SCHEMA}"

# Open the session from the named connection and make the schema above the current one
snf_session = Session.builder.config("connection_name", CONNECTION_NAME).create()
snf_session.use_schema(FULLY_QUALIFIED_NAME)
# Last expression of the cell: displays the session's current schema as a sanity check
snf_session.get_fully_qualified_current_schema()

In [None]:
# Create the Model Registry client on top of the open session
# NOTE(review): no database/schema is passed, so the registry presumably uses the session's current ones — confirm
snowml_registry = Registry(snf_session)

## Scikit-Learn

Train a RandomForestRegressor model within a pipeline and log the fitted pipeline into the Snowpark Model Registry.

In [None]:
# SKLearn Imports
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline



Get some data to use for training

In [None]:

# Load the diamonds dataset and, in the same expression, rename the "table"
# column to "table_pct" so that we do not have any issues selecting the column
# when the data is used in Snowflake.
data = (
    pd.read_csv(
        "https://raw.githubusercontent.com/tidyverse/ggplot2/882584f915b23cda5091fb69e88f19e8200811bf/data-raw/diamonds.csv",
        sep=',',
    )
    .rename(columns={'table': 'table_pct'})
)

# Preview the first rows
data.head()


Define the categorical and numerical columns, create the X and y datasets, and split them into train and test sets.

In [None]:
# Feature groups; each group gets its own preprocessing later on
CATEGORICAL_COLUMNS = ["cut", "color", "clarity"]
NUMERICAL_COLUMNS = ["carat", "depth", "table_pct", "x", "y", "z"]

# The target is the price; every other column is a feature
y = data["price"]
X = data.drop(columns=["price"])

# Keep 90% of the rows for training; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.9,
    random_state=123,
)

Create a Pipeline that does the preprocessing and then fits a RandomForestRegressor.

In [None]:

# Explicit category order for the ordinal encoder, one array per categorical
# column, in the same order as CATEGORICAL_COLUMNS
categories = [
    np.array(["Ideal", "Premium", "Very Good", "Good", "Fair"]), # cut
    np.array(['D', 'E', 'F', 'G', 'H', 'I', 'J']), # color
    np.array(["IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1", "I2", "I3"]), # clarity
]

# Categorical columns: encode each value as its position in the lists above
cat_transformer = Pipeline(steps=[
        ('oe', OrdinalEncoder(categories=categories))
    ])

# Numerical columns: min-max scale, clipping values outside the range seen during fit
num_transformer = Pipeline(steps=[
        ('scaler', MinMaxScaler(clip=True))
    ])

# Combine into a column transformer
preprocessor = ColumnTransformer(
    [
        ('num', num_transformer, NUMERICAL_COLUMNS),
        ('cat', cat_transformer, CATEGORICAL_COLUMNS),
    ],  verbose_feature_names_out=False,
)


# Create a pipeline with the column transformer and the training of a Random Forest Regressor.
# The step keeps its original name 'classifier' so existing references to it still work.
# random_state is fixed (same seed as the train/test split) so that the fitted model, and
# therefore the MAPE that gets logged with it, is reproducible between runs.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestRegressor(n_jobs=-1, random_state=123))])

# Train
pipeline.fit(X_train, y_train)

Calculate the MAPE on the test data

In [None]:
# Evaluate: predict on the held-out test features
y_pred = pipeline.predict(X_test)

# Compare the predictions with the true test targets; the value is logged as a model metric later
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"Mean absolute percentage error: {mape}")

Every model saved in the Snowpark Model Registry needs a unique name within the schema it is saved in. A model name can have multiple versions, where each version needs a unique name (within the model).

In [None]:
# Name the scikit-learn pipeline is registered under (must be unique within the schema)
skl_model_name = "skl_diamonds"
# Name of this version (must be unique within the model)
skl_version_name = 'V1'


Log the SKLearn pipeline

In [None]:
# Log the fitted pipeline as version `skl_version_name` of model `skl_model_name`.
# A few training rows are passed as sample input (presumably used to infer the
# model signature — confirm) and the test MAPE is attached as a metric.
skl_mv = snowml_registry.log_model(model=pipeline, 
                                   model_name = skl_model_name, 
                                   version_name = skl_version_name,
                                   sample_input_data = X_train.head(),
                                   metrics = {"test_mape": mape},
                                   comment='SKLearn pipline'
                                )


We can check which functions are available by using **show_functions()**

In [None]:
# List the functions exposed by the logged model version
skl_mv.show_functions()

Create a Snowpark DataFrame to test the deployed model

In [None]:
# One hand-written diamond, with the same feature columns the pipeline was trained on
snf_test_df = snf_session.create_dataframe(
    [[0.23, 'Ideal', 'E', 'SI2', 61.5, 55.0, 3.95, 3.98, 2.43]],
    schema=['carat', 'cut', 'color', 'clarity', 'depth', 'table_pct', 'x', 'y', 'z'],
)
snf_test_df.show()

Use the model on the Snowpark DataFrame

In [None]:
# Run the logged model version on the Snowpark DataFrame and show the result
skl_mv.run(snf_test_df).show()

# Using CustomModel
https://docs.snowflake.com/en/developer-guide/snowpark-ml/reference/latest/model#snowflake-ml-model-custom-model

In [None]:
# Needed additional Snowpark ML modules
from snowflake.ml.model import custom_model
from snowflake.ml.model import model_signature

import os
import shutil

## PyCaret

In [None]:
# Needed PyCaret modules
from pycaret.classification import ClassificationExperiment, predict_model, load_model
from pycaret.datasets import get_data

Start by running a Classification Experiment using the Juice dataset

In [None]:
# Load the PyCaret 'juice' sample dataset.
# NOTE: this rebinds `data`, which held the diamonds DataFrame earlier in the notebook.
data = get_data('juice')

# Set up a classification experiment with 'Purchase' as the target
# (session_id is presumably the random seed of the experiment — confirm)
cl_exp = ClassificationExperiment()
cl_exp.setup(data, target='Purchase', session_id=123)
# Compare the candidate models and keep the one compare_models returns
best_model = cl_exp.compare_models()

In order to log the model/experiment into the Snowpark Model Registry we need to:
1) Serialize the model into a file
2) Create a CustomModel class

Start by saving the model as a file using the *save_model* method.

In [None]:
# Save model as file
cl_exp.save_model(best_model, "juice_best_model")

We need to set up a file structure to be used for the CustomModel and to move the saved file into it.

In [None]:

# Root directory for the files that will be packaged with the custom model
ARTIFACTS_DIR = "/tmp/pycaret/"
# Create the directory where we will move the file (no error if it already exists)
os.makedirs(os.path.join(ARTIFACTS_DIR, "model"), exist_ok=True)
# Move the saved model from the working directory into the "model" sub-directory
shutil.move('juice_best_model.pkl', os.path.join(ARTIFACTS_DIR, 'model',  'juice_best_model.pkl'))

The next step is to create a CustomModel class that will be used in Snowflake when calling the methods/functions of the model. In this case we will only support the *predict* function, but if we wanted to support additional functions we would specify those as methods of our class.

In [None]:
# CustomModel wrapper that lets a saved PyCaret pipeline be used from the Snowpark Model Registry
class PyCaretModel(custom_model.CustomModel):
    """Expose a saved PyCaret classification pipeline through the CustomModel API.

    The pipeline is loaded from the ``model_file`` artifact of the ModelContext
    the instance is created with. Only ``predict`` is exposed.
    """

    def __init__(self, context: custom_model.ModelContext) -> None:
        """Load the PyCaret pipeline referenced by the ``model_file`` artifact.

        Args:
            context: ModelContext whose ``model_file`` artifact points to the
                ``.pkl`` file written by PyCaret's ``save_model``.
        """
        super().__init__(context)
        # The model is saved with a .pkl suffix, and the filename is part of the
        # properties of the ModelContext we create when logging it to Snowflake.
        # PyCaret's load function expects the name without that suffix, so strip
        # it; a path that does not end in ".pkl" is passed on unchanged instead
        # of blindly cutting off its last four characters.
        model_file = self.context.path("model_file")
        model_name, extension = os.path.splitext(model_file)
        if extension != ".pkl":
            model_name = model_file
        # Load the model
        self.model = load_model(model_name, verbose=False)
        # When running this model in Snowflake it will use a WH and we do not have access to /var/ on the nodes so
        # we need to change to a directory we have access to, in this case /tmp/
        self.model.memory = '/tmp/'

    @custom_model.inference_api
    def predict(self, X: pd.DataFrame) -> pd.DataFrame:
        """Score ``X`` with the loaded PyCaret pipeline.

        Args:
            X: Input features as a Pandas DataFrame.

        Returns:
            A DataFrame with the ``prediction_label`` and ``prediction_score``
            columns produced by PyCaret's ``predict_model``.
        """
        model_output = predict_model(self.model, data=X)
        # Return only the two prediction columns, not the echoed input features
        res_df = pd.DataFrame({
            "prediction_label": model_output['prediction_label'],
            "prediction_score": model_output['prediction_score'],
        })

        return res_df

We can now use this CustomModel class every time we want to log a PyCaret ClassificationExperiment to Snowflake.

Before logging the model we need to define the ModelContext, which will point to the artifacts (files) needed when using the model in Snowflake.

In [None]:
# Describe what the custom model needs to have available when it runs
pycaret_mc = custom_model.ModelContext(
    # This should be for models that are supported by the Model Registry (none here)
    models={},
    # Everything not supported needs to be here
    artifacts={
        'model_file': os.path.join(ARTIFACTS_DIR, "model", 'juice_best_model.pkl'),
    },
)


We can now create a new Model object and test it with some data. We will save the predictions into a Pandas DataFrame so we can use it later when generating a model signature.

In [None]:
# Instantiate the custom model locally, which loads the saved PyCaret pipeline
my_pycaret_model = PyCaretModel(pycaret_mc)

# Features only: drop() returns a new DataFrame, so `data` itself is left untouched
new_data = data.drop(columns='Purchase')

# Score locally and keep the output; it is displayed as the result of the cell
output_pd = my_pycaret_model.predict(new_data)
output_pd

Every model saved in the Snowpark Model Registry needs a unique name within the schema it is saved in. A model name can have multiple versions, where each version needs a unique name (within the model).

In [None]:
# Name the PyCaret model is registered under (must be unique within the schema)
model_name = "pycaret_juice"
# Name of this version (must be unique within the model)
version_name = "v1"

Before logging the model we need to provide a Model Signature. A Model Signature can be created using sample data for the input and output, and we can use the *model_signature.infer_signature* function to generate it from the data.

In this case we can use the **new_data** Pandas DataFrame as the input data and the **output_pd** Pandas DataFrame as the output.

In [None]:
# We need to create the signature ourselves since it is passed explicitly when
# the custom model is logged.
# Use the same leading rows of the input and of the output instead of two
# independent, unseeded random samples: the signature is then built from rows
# that belong together and is identical on every run. head() also copes with
# frames shorter than 100 rows, where sample(100) would raise.
predict_sign = model_signature.infer_signature(
    input_data=new_data.head(100),
    output_data=output_pd.head(100),
)
predict_sign

We can now log the model; we will use the model signature for the predict function.

In [None]:
# Log the custom model as version `version_name` of model `model_name`.
# pycaret is declared as a conda dependency, and the signature inferred above
# is registered for the `predict` function.
custom_mv = snowml_registry.log_model(
    my_pycaret_model,
    model_name=model_name,
    version_name=version_name,
    conda_dependencies=["pycaret"],
    signatures={"predict": predict_sign},
    comment = 'PyCaret ClassificationExperiment using the CustomModel API'
)

In [None]:
# Upload the input features to the table "pycaret_input_data"
# (created if it does not exist, overwritten if it does) and show the returned object
new_data_snowflake = snf_session.write_pandas(new_data,"pycaret_input_data", auto_create_table=True, overwrite=True, quote_identifiers=False )
new_data_snowflake.show()

In [None]:
# Call the `predict` function of the logged model version on the uploaded data and show the result
custom_mv.run(new_data_snowflake, function_name='predict').show()

```
SELECT 
 pycaret_juice!predict(*) as predict_dict,
 predict_dict['prediction_label']::text as prediction_label,
 predict_dict['prediction_score']::double as prediction_score
from pycaret_input_data;
```