# Snowpark For Python -- Titanic Survival Prediction

### In this session, we will cover:

* Creating Session object and connecting to Snowflake
* Loading data from Snowflake table into Snowpark DataFrame
* Creating Stored Procedure to deploy model training code on Snowflake
* Creating User-Defined Function (UDF) for inference

### Import libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [None]:
from snowflake.snowpark.session import Session
from snowflake.snowpark import functions as F
from snowflake.snowpark import types as T
from snowflake.snowpark import Window

In [None]:
import sklearn as skl
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

In [None]:
import json
import sqlparse
import pandas as pd
import cachetools

from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

### Connect to Snowflake
Create a connection to Snowflake. Snowpark supports the following authentication methods:
* Username and password
* externalbrowser (Okta, ADFS, or any other SAML 2.0-compliant identity provider (IdP))
* oauth
* Key pair

This example is using a JSON file with the following structure
```
{
    "account":"MY SNOWFLAKE ACCOUNT",
    "user": "MY USER",
    "password":"MY PASSWORD",
    "role":"MY ROLE",
    "warehouse":"MY WH",
    "database":"MY DB",
    "schema":"MY SCHEMA"
}

```

In [None]:
# Load the Snowflake connection parameters from the JSON credentials file.
# JSON text is UTF-8, so the encoding is stated explicitly instead of relying on
# the platform default (which would mis-decode non-ASCII values on some systems).
with open('../creds.json', encoding='utf-8') as f:
    connection_parameters = json.load(f)

In [None]:
session = Session.builder.configs(connection_parameters).create()
session.sql_simplifier_enabled = True

In [None]:
print(f"Current schema: {session.get_fully_qualified_current_schema()}")
print(f"Current role: {session.get_current_role()}")
print(f"Current warehouse: {session.get_current_warehouse()}")

### Create a DataFrame based on the Titanic table

In [None]:
titanic_df = session.table("titanic")

In [None]:
print(f"Number of rows: {titanic_df.count()}")
titanic_df.show()

In [None]:
titanic_df.queries

### Basic analysis


Count by Survived

In [None]:
titanic_df.group_by("SURVIVED").count().show()

Add percentages

In [None]:
# Look up the SQL function RATIO_TO_REPORT by name so it can be used in column expressions.
ratio_to_report = F.function("RATIO_TO_REPORT")

# Number of passengers per SURVIVED value, plus that count's share of the total in percent.
(
    titanic_df
    .group_by(F.col("SURVIVED"))
    .agg(F.count('*').as_("PASSENGERS"))
    .select(
        F.col("SURVIVED"),
        F.col("PASSENGERS"),
        (ratio_to_report(F.col("PASSENGERS")).over() * 100).as_("percentage"),
    )
    .show()
)

Describe all numeric and categorical features

In [None]:
titanic_df.describe().show()

Based on the statistics above, we can drop some of the columns

In [None]:
titanic_df = titanic_df.drop(["NAME", "TICKET", "CABIN", "BOAT", "BODY", "HOME_DEST", "SIBSP", "PARCH"])
titanic_df.show()

Let's have a look at the data types of the remaining columns

In [None]:
# Print name, nullability and Snowpark data type for every column left in the DataFrame.
for field in titanic_df.schema.fields:
    print(f"{field.name}, Nullable: {field.nullable}, {field.datatype}")

PCLASS is stored as a number but is a categorical variable, so we convert it to a character type instead

In [None]:
titanic_df = titanic_df.with_column("PCLASS", F.to_varchar("PCLASS"))

It also seems like there are null values in EMBARKED

In [None]:
titanic_df.group_by("EMBARKED").count().show()

Replace missing values in EMBARKED with S

In [None]:
titanic_df = titanic_df.fillna({"EMBARKED": "S"})
titanic_df.group_by("EMBARKED").count().show()

In [None]:
print(sqlparse.format(titanic_df.queries['queries'][0], reindent=True))

Next, let's look at the relationship between each of the features and our target variable.

In [None]:
# Columns stored as strings are treated as the categorical features.
# isinstance() is the idiomatic type check (and also matches subclasses of StringType).
cols = [c.name for c in titanic_df.schema.fields if isinstance(c.datatype, T.StringType)]
for col in cols:
    if col == "SURVIVED":
        # The target is not compared against itself.
        continue
    # Within each value of the feature, compute the percentage of rows per SURVIVED
    # value and pivot SURVIVED into the columns '0' and '1'.
    window = Window.partition_by(col)
    # show() prints the table itself and returns None, so it is not wrapped in
    # display(): that would only add a stray "None" after every table.
    (
        titanic_df.group_by(col, "SURVIVED").count()
        .select(col, "SURVIVED", (ratio_to_report(F.col("COUNT")).over(window) * 100).as_("percentage"))
        .pivot("SURVIVED", ['0', '1'])
        .agg(F.sum("percentage"))
        .show(20)
    )


### Model training

We are going to create a function for training a model that we will run in Snowflake as a Stored Procedure.

Start by selecting the columns we are going to use and pulling the data back as a Pandas DataFrame so we can test the function locally. The cell below pulls a 10% sample, which is what you would do when there is a lot of data.

In [None]:
df = titanic_df.sample(frac=0.10).to_pandas()

X = df[["EMBARKED", "SEX", "PCLASS", "AGE", "FARE"]]
y = df["SURVIVED"]

Define the training function, where we will also do some data preprocessing. By using a Pipeline we can reuse the same preprocessing when using the model for predictions.

In [None]:
def train(X, y):
    """Fit a preprocessing + random-forest pipeline on X and y and return it.

    X must provide the columns EMBARKED, SEX, PCLASS, AGE and FARE; y holds the
    target labels. The preprocessing is part of the returned Pipeline, so the
    same transformations are applied again whenever the model predicts.
    """
    categorical_features = ["EMBARKED", "SEX", "PCLASS"]
    numeric_features = ["AGE", "FARE"]

    # Categorical branch: fill gaps with the most frequent value, then one-hot
    # encode. The imputer is kept even if the training data has no gaps, because
    # data seen in production may still contain missing values.
    # NOTE: `sparse=False` is the spelling for the scikit-learn 1.1.x release
    # pinned in this notebook; newer releases call this option `sparse_output`.
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
    ]

    # Numeric branch: impute from the 5 nearest neighbours, then scale robustly.
    numeric_steps = [
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', RobustScaler()),
    ]

    column_transformer = ColumnTransformer(
        [
            ('num', Pipeline(steps=numeric_steps), numeric_features),
            ('cat', Pipeline(steps=categorical_steps), categorical_features),
        ],
        verbose_feature_names_out=False,
    )

    estimator = Pipeline(steps=[
        ('preprocessor', column_transformer),
        ('classifier', RandomForestClassifier()),
    ])
    # Pipeline.fit returns the fitted pipeline itself.
    return estimator.fit(X, y)


Test the function locally.

In [None]:
model = train(X, y)

In [None]:
model

Check versions of local packages that we are going to use in Snowflake

In [None]:
print(f"Local Pandas version: {pd.__version__}")
print(f"Local scikit-learn version: {skl.__version__}")

Check the versions available in Snowflake

In [None]:
# List the pandas and scikit-learn versions offered for Python in Snowflake,
# ordered by package name and then by version (descending).
(
    session.table("information_schema.packages")
    .filter((F.col("language") == 'python') & F.col("PACKAGE_NAME").in_(['pandas', 'scikit-learn']))
    .sort(F.col("PACKAGE_NAME").asc(), F.col("VERSION").desc())
    .show(50)
)

In [None]:
pandas_version = 'pandas==1.4.4'
sklearn_version = 'scikit-learn==1.1.1'

Helper function to create a Snowflake internal stage

In [None]:
def create_stage(snf_session: Session, stage_name: str):
    """Create the stage `stage_name`, replacing a stage of the same name if one exists.

    Returns whatever `collect()` yields for the executed statement.
    """
    statement = f"create or replace stage {stage_name}"
    return snf_session.sql(statement).collect()

In [None]:
create_stage(session, "sp_stage")
create_stage(session, "model_stage")

Helper function used to save an object to a Snowflake stage; it is used within the training Stored Procedure

In [None]:
def save_file_to_stage(snf_session, obj, file_name, stage_path):
    """Serialize `obj` with joblib and upload it to a Snowflake stage.

    Args:
        snf_session: Snowpark session used for the upload.
        obj: Object to serialize (anything joblib can dump).
        file_name: Name of the file to create on the stage.
        stage_path: Stage folder to upload into, including the leading `@`.
            A trailing `/` is optional.

    Returns:
        The full stage path (folder + file name) of the uploaded file.
    """
    import io
    import joblib

    # Join folder and file name with exactly one '/', whether or not the caller
    # already supplied a trailing slash.
    separator = '/' if stage_path and not stage_path.endswith('/') else ''
    file_path = f"{stage_path}{separator}{file_name}"

    input_stream = io.BytesIO()
    joblib.dump(obj, input_stream)
    # joblib leaves the position at the end of the buffer; rewind so the upload
    # reads the serialized bytes from the start.
    input_stream.seek(0)

    # Use the public Snowpark file API rather than the private connection cursor.
    # auto_compress=False keeps the file under exactly `file_name` (no .gz suffix),
    # which matters because the returned path is later used as a UDF import.
    snf_session.file.put_stream(input_stream, file_path, auto_compress=False, overwrite=True)

    return file_path


Primary function for the Python Stored Procedure

In [None]:
def train_titanic(snf_session: Session, stage: str) -> dict:
    """Train the survival model on the `titanic` table and save it to a stage.

    Intended to run as a Snowflake Python Stored Procedure. The table is split
    80/20 at random; the pipeline is fitted on the larger part and scored on
    the smaller one.

    Args:
        snf_session: Snowpark session (supplied by Snowflake when run as a procedure).
        stage: Stage name/path, without the leading `@`, under which the model
            is stored in a timestamped sub-folder.

    Returns:
        A dict with the macro-averaged f1, precision and recall scores, the
        accuracy score, and `model_path`, the stage path of the saved model file.
    """
    from datetime import datetime
    from snowflake.snowpark.functions import to_varchar

    feature_cols = ["EMBARKED", "SEX", "PCLASS", "AGE", "FARE"]
    target_col = "SURVIVED"

    # PCLASS is stored as a number in the table, but the notebook casts it to a
    # string before local training and before calling the prediction UDF. Apply
    # the same cast here: otherwise the one-hot encoder learns numeric categories,
    # treats the string values seen at inference as unknown and (because of
    # handle_unknown='ignore') silently encodes them as all zeros.
    df_titanic = snf_session.table("titanic").select(
        "EMBARKED", "SEX", to_varchar("PCLASS").as_("PCLASS"), "AGE", "FARE", target_col
    )
    df_train, df_test = df_titanic.random_split([0.8, 0.2])

    # Fit the pipeline on the training split.
    pd_train = df_train.to_pandas()
    model = train(pd_train[feature_cols], pd_train[target_col])

    # Score the model on the held-out split.
    pd_test = df_test.to_pandas()
    y_test = pd_test[target_col]
    y_pred = model.predict(pd_test[feature_cols])

    ret_dict = {
        "f1_score": f1_score(y_test, y_pred, average='macro'),
        "precision_score": precision_score(y_test, y_pred, average='macro'),
        "recall_score": recall_score(y_test, y_pred, average='macro'),
        "accuracy_score": accuracy_score(y_test, y_pred),
    }

    # Save the model in a folder named after the current timestamp so that
    # successive runs do not write to the same path.
    save_path = datetime.now().strftime("%Y-%m-%d-%H%M%S")
    model_path = save_file_to_stage(snf_session, model, 'rfc_survive_model.joblib', f'@{stage}/{save_path}/')
    ret_dict['model_path'] = model_path
    return ret_dict

Create the Stored Procedure in Snowflake.
The **sproc** function returns a callable object that can be used to call the stored procedure.

In [None]:
session.clear_imports()
session.clear_packages()
session.add_packages('snowflake-snowpark-python',pandas_version, sklearn_version, 'cloudpickle==2.0.0', 'joblib')
train_titanic_sp = F.sproc(func=train_titanic,name="train_titanic", is_permanent = True, replace= True, stage_location = 'sp_stage/titanic/sp/', session=session)

Run the training Stored Procedure in Snowflake

In [None]:
ret_vals = json.loads(train_titanic_sp(session, 'model_stage/titanic'))
ret_vals

Verify that the model is stored

In [None]:
session.sql("ls @model_stage").collect()

Get the model file name and path to it

In [None]:
# Split the returned path at its last '/': the part before it is the stage folder,
# the part after it is the model file name. rpartition() yields
# (before, separator, after); [::2] keeps the two outer parts.
stage_path, model_name = ret_vals['model_path'].rpartition('/')[::2]

Deploy model as a UDF

In [None]:
def deploy_model(snf_session, udf_name, udf_stage, model_name, model_path):
    """Register a permanent, vectorized Snowflake UDF that predicts with a staged model.

    Args:
        snf_session: Snowpark session used to register the UDF.
        udf_name: Name of the UDF to create (an existing one is replaced).
        udf_stage: Stage location where the UDF code is uploaded.
        model_name: File name of the serialized model.
        model_path: Stage folder containing the model file, without a trailing `/`.
    """
    feature_cols = ["EMBARKED", "SEX", "PCLASS", "AGE", "FARE"]
    import_model_path = model_path + '/' + model_name

    # Load the model file; cachetools makes sure each file is only loaded once.
    @cachetools.cached(cache={})
    def read_file(filename):
        import joblib
        import sys
        import os

        import_dir = sys._xoptions.get("snowflake_import_directory")
        if not import_dir:
            # Fail with a clear message instead of returning None and crashing
            # later on `model.predict`.
            raise RuntimeError(
                "snowflake_import_directory is not set; the model file can only be "
                "loaded from inside a Snowflake UDF"
            )
        with open(os.path.join(import_dir, filename), 'rb') as file:
            return joblib.load(file)

    # Vectorized UDF: receives at most 100 rows per call.
    @F.udf(name = udf_name, max_batch_size=100, is_permanent = True, stage_location = udf_stage, imports = [import_model_path]
           , packages = [pandas_version, sklearn_version, 'cachetools'], replace = True, session = snf_session)
    def survived(ds: T.PandasSeries[dict]) -> T.PandasSeries[int]:
        model = read_file(model_name)
        # pd.json_normalize is the supported entry point; pd.io.json.json_normalize
        # is deprecated. reindex() puts the columns in the order the model expects
        # and, unlike plain [[...]] indexing, yields an all-NaN column (handled by
        # the pipeline's imputers) instead of a KeyError when no row of the batch
        # carries a key -- OBJECT_CONSTRUCT leaves out keys whose value is NULL.
        df = pd.json_normalize(ds).reindex(columns=feature_cols)
        return model.predict(df)


In [None]:
deploy_model(session, "predict_survival", "@sp_stage/titanic/udf/", model_name ,stage_path)

Test the deployed model (UDF)

In [None]:
titanic_df.select(F.call_function("predict_survival", F.object_construct('*')).as_("predicted"), F.col("SURVIVED").as_("actual")).show()

In [None]:
session.close()