# Demo of Snowpark end-to-end Machine Learning


The dataset is from https://archive-beta.ics.uci.edu/dataset/222/bank+marketing


**Run 00_Load_demo_data.ipynb to upload the Parquet files used for this Notebook**

In [None]:
# Imports 
import snowflake.snowpark as S
from snowflake.snowpark import Session
from snowflake.snowpark import functions as F
from snowflake.snowpark import types as T
from snowflake.snowpark import Window

import joblib
import cachetools
import io
import os

import json

# Make sure we do not get line breaks when doing show on wide dataframes
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

import sqlparse

# Print the version of Snowpark we are using
print(f"Using Snowpark: {S.__version__}")

Helper functions for nicer printing of a Snowpark DataFrame's schema and generated SQL, and for getting its shape.

In [None]:
# Helper functions for nicer printing
def print_sql(df):
    """Pretty-print the SQL statements behind a dataframe.

    Every statement found under ``df.queries['queries']`` is re-indented
    with ``sqlparse`` and printed in order.
    """
    statements = df.queries['queries']
    # Lazy generator: each statement is formatted right before it is printed
    formatted = (sqlparse.format(stmt, reindent=True) for stmt in statements)
    for text in formatted:
        print(text)

def print_schema(df):
    """Print one line per field of ``df.schema``.

    The output starts with a ``schema:`` header, followed by the name,
    datatype and nullability of every entry in ``df.schema.fields``.
    """
    lines = ["schema:"]
    lines.extend(
        f" |-- {field.name}: {field.datatype} (Nullable: {field.nullable})"
        for field in df.schema.fields
    )
    print("\n".join(lines))

def shape(df):
    """Return ``(row_count, column_count)`` for ``df``.

    The row count comes from ``df.count()`` and the column count from
    ``len(df.columns)``.
    """
    n_rows = df.count()
    n_cols = len(df.columns)
    return n_rows, n_cols

Connect to Snowflake

This example is using a JSON file with the following structure
```
{
    "account":"MY SNOWFLAKE ACCOUNT",
    "user": "MY USER",
    "password":"MY PASSWORD",
    "role":"MY ROLE",
    "warehouse":"MY WH",
    "database":"MY DB",
    "schema":"MY SCHEMA"
}

```

In [None]:
# Read the connection settings from the JSON credentials file
with open('../creds.json', encoding='utf-8') as f:
    connection_parameters = json.load(f)

session = Session.builder.configs(connection_parameters).create()
# An f-string is used so the summary still prints when one of the getters
# returns None (e.g. no warehouse set); "+" concatenation with None would
# raise a TypeError.
print(
    f"Current role: {session.get_current_role()}"
    f", Current schema: {session.get_fully_qualified_current_schema()}"
    f", Current WH: {session.get_current_warehouse()}"
)

In [None]:
# Parameters
source_path = "@SOURCE_FILES/BANK_MARKETING" # Stage path where the source Parquet files are stored
sp_udf_stage = "BANK_STAGE" # Name of the stage used for storing the code for the SP and UDF, as well as the trained model files

## Loading of source data
### Loading Parquet files with inferring the schema.

Start by checking that the source files are on the stage

In [None]:
session.sql(f"ls {source_path}").select('"name"').show(30, max_width=150)

Take a peek at the files

In [None]:
session.sql("create or replace temp file format parq1 type='PARQUET'").collect()
session.sql(f"select $1 from {source_path} (file_format=>parq1 )").show(2)

Loading Parquet files with inferring the schema.

In [None]:
df_reader = session.read.parquet(source_path)
df_reader.show()

The df_reader DataFrame reads the files from the stage when it is used; we can check this by looking at the SQL it generates

In [None]:
print_sql(df_reader)

To load the data into a Snowflake table we can use copy_into_table.

It will create the table if it does not exist, using the inferred schema, and if the table exists it will append the data. However, Snowflake keeps track of which files it has loaded so it does not load the same file twice; by dropping the table first we ensure that the files are loaded

In [None]:
session.sql("DROP TABLE IF EXISTS bank_marketing_v2").collect()
df_reader.copy_into_table("bank_marketing_v2")

## Data exploration

Create a Snowpark Dataframe using the new table

In [None]:
df_bank_marketing = session.table("bank_marketing_v2")
display(f"Dataframe shape: {shape(df_bank_marketing)}")
df_bank_marketing.show()

In [None]:
print_sql(df_bank_marketing)

### Data understanding

Start by verifying the data types; simply put, we will treat character columns as categorical

In [None]:
print_schema(df_bank_marketing)

DAY is stored as a number but can be treated as categorical, since there is a fixed number of days in a month; changing the data type to character achieves that.

In [None]:
df_bank_marketing_prep = df_bank_marketing.with_column("DAY", F.to_varchar(F.col("DAY")))
print_schema(df_bank_marketing_prep)

Get basic statistics about the categorical and numeric columns

In [None]:
df_bank_marketing_prep.describe().show()

Create variables with our categorical, numeric and target columns names so we can use them with encoders and scalers

In [None]:
# Name of the label column; it is excluded from the categorical features
target_col = "Y"
# Snowpark datatypes that count as numeric features
numeric_types = [T.DecimalType, T.LongType, T.DoubleType, T.FloatType, T.IntegerType]
# Read the schema once and derive both feature lists from it
schema_fields = df_bank_marketing_prep.schema.fields
cat_cols = [
    fld.name
    for fld in schema_fields
    if type(fld.datatype) == T.StringType and fld.name != 'Y'
]
num_cols = [
    fld.name
    for fld in schema_fields
    if type(fld.datatype) in numeric_types
]

Distribution of target values

In [None]:
df_bank_marketing_prep.group_by(target_col).count().show()

Frequency tables for each categorical feature

In [None]:
# For every categorical feature, print the number of distinct values followed
# by each value's share of the observations, largest share first.
# DataFrame.show() prints the rows itself and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
for col in cat_cols:
    df_bank_marketing_prep.select(F.count_distinct(col).as_(f"{col} distinct values")).show()
    (
        df_bank_marketing_prep.group_by(col).count()
        .select(col, (F.call_function("RATIO_TO_REPORT", F.col("COUNT")).over() * 100).as_("% observations"))
        .sort(F.col("% observations").desc())
        .show(31)
    )

Relationship between each of the categorical features and the target column

In [None]:
# For every categorical feature, print the percentage of 'no' and 'yes'
# target values within each category (RATIO_TO_REPORT over a per-category
# window, pivoted on the target column).
# DataFrame.show() prints the rows itself and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
for col in cat_cols:
    window = Window.partition_by(col)
    (
        df_bank_marketing_prep.group_by(col, F.col(target_col))
        .count()
        .select(col, F.col(target_col), (F.call_function("RATIO_TO_REPORT", F.col("COUNT")).over(window) * 100).as_("percentage"))
        .pivot(target_col, ['no', 'yes']).agg(F.sum("percentage"))
        .show(50)
    )


### Model training

In [None]:
def train_response_model(X, y, cat_cols, num_cols):
    """Fit a preprocessing + random forest pipeline and return it.

    Args:
        X: Feature data containing the columns named in ``cat_cols`` and
            ``num_cols``.
        y: Target values to fit against.
        cat_cols: Names of the columns to one-hot encode.
        num_cols: Names of the columns to standard-scale.

    Returns:
        The fitted scikit-learn ``Pipeline`` with the steps
        ``preprocessor`` and ``classifier``.
    """
    # Numerical columns go through a standard scaler
    scaling_steps = [('scaler', StandardScaler())]
    # Categorical columns are one-hot encoded with handle_unknown='ignore'
    encoding_steps = [('onehot', OneHotEncoder(handle_unknown='ignore'))]

    # Route each group of columns to its own transformer
    column_steps = [
        ('num', Pipeline(steps=scaling_steps), num_cols),
        ('cat', Pipeline(steps=encoding_steps), cat_cols),
    ]
    preprocessor = ColumnTransformer(column_steps, verbose_feature_names_out=False)

    # Preprocessing followed by a Random Forest classifier using all cores
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_jobs=-1)),
    ])
    return pipe.fit(X, y)

In [None]:
# Train locally on a 10% sample pulled into pandas. The sample is taken from
# df_bank_marketing_prep (DAY cast to a string) because cat_cols and num_cols
# were derived from that dataframe's schema; sampling the raw table would
# hand DAY to the one-hot encoder as a number instead.
pd_train = df_bank_marketing_prep.sample(frac=0.10).to_pandas()

X = pd_train[[*cat_cols, *num_cols]]
y = pd_train[target_col]
model = train_response_model(X, y, cat_cols, num_cols)

In [None]:
model

Define the functions that train and test the model and save it to a stage

In [None]:
def save_file_to_stage(snf_session, obj, file_name, stage_path):
    """Serialise ``obj`` with joblib and upload it to a stage.

    Args:
        snf_session: Snowpark session whose connection performs the upload.
        obj: Object to serialise with ``joblib.dump``.
        file_name: Name of the file to create under ``stage_path``.
        stage_path: Stage location, e.g. ``@MY_STAGE/models/``. A trailing
            slash is optional.

    Returns:
        The full stage path (``stage_path`` + ``file_name``) that was
        uploaded to.
    """
    # Local imports; presumably so the function is self-contained when it is
    # shipped to Snowflake as part of a stored procedure - TODO confirm.
    import io
    import joblib

    # Join with exactly one "/" so a stage_path without a trailing slash
    # still yields "<stage>/<file>" instead of "<stage><file>".
    file_path = stage_path.rstrip('/') + '/' + file_name

    input_stream = io.BytesIO()
    joblib.dump(obj, input_stream)
    # joblib.dump leaves the position at the end of the buffer; rewind so the
    # upload starts reading at the first byte.
    input_stream.seek(0)
    # NOTE(review): this goes through private connection attributes; confirm
    # whether the public session.file.put_stream can be used instead.
    snf_session._conn._cursor.upload_stream(input_stream, file_path)

    return file_path

def train_bank(snf_session: Session, stage: str) -> dict:
    """Train, evaluate and store the bank marketing response model.

    Reads the ``bank_marketing_v2`` table, splits it 80/20 into train and
    test data, fits the pipeline from ``train_response_model`` on the train
    part, scores it on the test part and uploads the fitted model to a
    timestamped folder below ``stage``.

    Args:
        snf_session: Snowpark session used to read the table and upload the
            model file.
        stage: Stage location (without the leading ``@``) under which the
            model file is stored.

    Returns:
        A dict with the macro-averaged ``f1_score``, ``precision_score`` and
        ``recall_score``, the ``accuracy_score`` and the ``model_path`` of
        the uploaded model file.
    """
    from datetime import datetime

    df_source = snf_session.table("bank_marketing_v2")
    # DAY is stored as a number but is treated as a categorical feature in the
    # rest of this notebook (cast to a string before the feature lists are
    # built, and declared as a str input of the prediction UDF). Apply the
    # same cast here so the trained model agrees with those feature types.
    if "DAY" in df_source.columns:
        df_source = df_source.with_column("DAY", F.to_varchar(F.col("DAY")))

    df_train, df_test = df_source.random_split([0.8, 0.2])

    # Get the categorical, numerical and target column
    target_col = "Y"
    numeric_types = (T.DecimalType, T.LongType, T.DoubleType, T.FloatType, T.IntegerType)
    schema_fields = df_train.schema.fields
    cat_cols = [
        c.name
        for c in schema_fields
        if isinstance(c.datatype, T.StringType) and c.name != target_col
    ]
    num_cols = [c.name for c in schema_fields if isinstance(c.datatype, numeric_types)]
    X_cols = [*cat_cols, *num_cols]

    pd_train = df_train.to_pandas()
    X = pd_train[X_cols]
    y = pd_train[target_col]

    # fit the pipeline
    model = train_response_model(X, y, cat_cols, num_cols)

    # Test the model
    pd_test = df_test.to_pandas()
    X_test = pd_test[X_cols]
    y_test = pd_test[target_col]
    y_pred = model.predict(X_test)

    # Create a dict with some test scores based on test data to return
    ret_dict = {
        "f1_score": f1_score(y_test, y_pred, average='macro'),
        "precision_score": precision_score(y_test, y_pred, average='macro'),
        "recall_score": recall_score(y_test, y_pred, average='macro'),
        "accuracy_score": accuracy_score(y_test, y_pred),
    }

    # Save the model to stage, in a folder named after the current timestamp
    # so earlier models are not overwritten
    save_path = datetime.now().strftime("%Y-%m-%d-%H%M%S")
    model_path = save_file_to_stage(snf_session, model, 'rfc_bank_model.joblib', f'@{stage}/{save_path}/')
    ret_dict['model_path'] = model_path
    return ret_dict

Create a stage for storing the code for the SP, UDF and model file.

If you do not want to remove the old models, skip this step

In [None]:
session.sql(f"CREATE OR REPLACE STAGE {sp_udf_stage}").collect()

Create the Stored Procedure in Snowflake.
The **sproc** function returns a callable object that can be used to call the stored procedure.

In [None]:
import sklearn
sklearn.__version__

In [None]:
# Reset the session-level imports and packages before declaring the ones
# the stored procedure needs
session.clear_imports()
session.clear_packages()
session.add_packages('snowflake-snowpark-python','pandas', 'scikit-learn==1.2.2', 'joblib')

# Register train_bank as a permanent stored procedure; its code is stored
# under the bank/sp/ folder of the stage
train_bank_sp = F.sproc(
    func=train_bank,
    name="train_bank",
    is_permanent=True,
    replace=True,
    stage_location=f'{sp_udf_stage}/bank/sp/',
    session=session,
)

Run the training Stored Procedure in Snowflake

In [None]:
sp_dict = json.loads(train_bank_sp(session, f'{sp_udf_stage}/bank/model'))
sp_dict

Check that the model is saved to the Snowflake stage

In [None]:
session.sql(f"ls @{sp_udf_stage}/bank/model").show(max_width=150)

Deploy model as a UDF

In [None]:
def deploy_model(snf_session, udf_name, udf_stage, model_name, import_model_path, features):
    """Register a permanent vectorized UDF that scores rows with a stored model.

    Args:
        snf_session: Snowpark session used to register the UDF.
        udf_name: Name the UDF is registered under.
        udf_stage: Stage location where the UDF code is stored.
        model_name: File name of the model inside the UDF import directory.
        import_model_path: Stage path of the model file to import.
        features: Column names, in the same order as the UDF inputs
            (ten string columns followed by six integer columns).

    Returns:
        The registered UDF object, so callers can use it directly as well as
        by name.
    """

    # Function to load the model file, using cachetools makes sure file is only loaded once
    @cachetools.cached(cache={})
    def read_file(filename):
        import joblib
        import sys
        import os

        import_dir = sys._xoptions.get("snowflake_import_directory")
        if not import_dir:
            # Fail here with a clear message instead of returning None and
            # hitting an AttributeError on model.predict later on.
            raise RuntimeError(
                "snowflake_import_directory is not set; cannot load " + filename
            )
        with open(os.path.join(import_dir, filename), 'rb') as file:
            return joblib.load(file)

    # Use a vectorized udf, gets maximum 100 rows at a time.
    # joblib is listed explicitly because read_file imports it.
    @F.udf(name=udf_name, max_batch_size=100, is_permanent=True, stage_location=udf_stage, imports=[import_model_path],
           packages=['pandas', 'scikit-learn==1.2.2', 'cachetools', 'joblib'], replace=True, session=snf_session)
    def predict_response(pd_input: T.PandasDataFrame[str, str, str, str, str, str, str, str, str, str, int, int, int, int, int, int]) -> T.PandasSeries[str]:
        model = read_file(model_name)
        # Name the positional input columns so the model finds its features
        pd_input.columns = features
        prediction = model.predict(pd_input)
        return prediction

    return predict_response


Call the deployment function

In [None]:
features = [*cat_cols, *num_cols]
deploy_model(session, "predict_response", f"@{sp_udf_stage}/bank/udf/", "rfc_bank_model.joblib" ,sp_dict['model_path'], features)

Test the UDF with our data

In [None]:
# Generate a list of columns to use with the UDF
input_cols = [F.col(col) for col in features]

# Score df_bank_marketing_prep rather than the raw table: features was built
# from the prepared dataframe's schema, where DAY is a string column, and the
# UDF declares its first ten inputs as strings.
df_response_scores = df_bank_marketing_prep.select(*input_cols, F.col(target_col), F.call_function("predict_response", *input_cols).alias('PREDICTION'))
df_response_scores.show()

Using the crosstab method allows us to do a quick confusion matrix

In [None]:
df_response_scores.crosstab("Y", "PREDICTION").show()

In [None]:
session.close()