## Use Customized Notebook Template (CLAIMS_RESERVE_Template)

In [2]:
from snowflake.snowpark import Session
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.modeling.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
# import seaborn as sns
from sklearn.metrics import mean_absolute_percentage_error
# Pandas Tools
from snowflake.connector.pandas_tools import write_pandas
# Data Science Libs
import numpy as np
import pandas as pd
# create_temp_table warning suppresion
import warnings; warnings.simplefilter('ignore')
from joblib import dump, load
# FosforIO to read from snowflake
from fosforio import snowflake
# FosforML to register Model on FDC
from fosforml import *
from fosforml.constants import MLModelFlavours
import requests

# Read data using FosforML

In [3]:
from fosforml.model_manager.snowflakesession import get_session
my_session = get_session()

In [7]:
table_name = 'AUTO_INSURANCE_CLAIMS_DATA'

In [8]:
sf_df = my_session.sql("select * from {}".format(table_name))

In [27]:
df = sf_df
#df = sf_df.to_pandas()

In [3]:
# To read a specific dataset published from a snowflake connection
#df = snowflake.get_dataframe("AUTO_INSURANCE_CLAIMS_DATA_PRODUCT")

In [28]:
df_backup = df

In [29]:
df = df.drop(['POLICY_NUMBER','MONTHS_AS_CUSTOMER','CUSTOMER_AGE','POLICY_BIND_DATE','POLICY_STATE','POLICY_CSL','UMBRELLA_LIMIT',
 'INSURED_ZIP','INSURED_SEX','INSURED_EDUCATION_LEVEL','INSURED_OCCUPATION','INSURED_HOBBIES','INSURED_RELATIONSHIP',
 'CAPITAL_GAINS','CAPITAL_LOSS', 'INCIDENT_DATE', 'AUTHORITIES_CONTACTED', 'INCIDENT_STATE', 'INCIDENT_CITY',
 'INCIDENT_LOCATION', 'INCIDENT_HOUR_OF_THE_DAY', 'INCIDENT_TIME_OF_DAY', 'WITNESSES', 'POLICE_REPORT_AVAILABLE','POLICY_BIND_DATE_CUSTOM',
 'INJURY_CLAIM', 'PROPERTY_CLAIM', 'VEHICLE_CLAIM', 'FRAUD_REPORTED'])

In [30]:
df.show()

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"POLICY_DEDUCTABLE"  |"POLICY_ANNUAL_PREMIUM"  |"INCIDENT_TYPE"           |"COLLISION_TYPE"       |"INCIDENT_SEVERITY"  |"NUMBER_OF_VEHICLES_INVOLVED"  |"PROPERTY_DAMAGE"   |"BODILY_INJURIES"  |"TOTAL_CLAIM_AMOUNT_PAID"  |"AUTO_MAKE"  |"AUTO_MODEL"  |"AUTO_YEAR"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|362                  |489                      |Multi-vehicle Collision   |Side Collision         |Major Damage         |2                              |Property Damage     |0                  |7349

In [31]:
train_df, test_df = df.random_split([0.8,0.2], seed=69) #seed=60 for Model version 2
CATEGORICAL_COLUMNS = ["INCIDENT_TYPE", "COLLISION_TYPE","INCIDENT_SEVERITY", "PROPERTY_DAMAGE",
                       "AUTO_MAKE","AUTO_MODEL","AUTO_YEAR"]
NUMERICAL_COLUMNS = ["POLICY_DEDUCTABLE", "POLICY_ANNUAL_PREMIUM","NUMBER_OF_VEHICLES_INVOLVED", "BODILY_INJURIES"]
LABEL_COLUMNS = ["TOTAL_CLAIM_AMOUNT_PAID"]
OUTPUT_COLUMNS = ["PREDICTION"]

In [32]:
## train_df and test_df are both snowpark dataframes
pipeline = Pipeline(
    steps=[(
                "OE",
                OrdinalEncoder(
                input_cols=CATEGORICAL_COLUMNS,
                output_cols=CATEGORICAL_COLUMNS,
                handle_unknown='use_encoded_value',
                unknown_value=-1
                )),
               ("MMS",
                MinMaxScaler(
                clip=True,
                input_cols=NUMERICAL_COLUMNS,
                output_cols=NUMERICAL_COLUMNS,
                )),
               ("classification",
                XGBRegressor(
                input_cols=CATEGORICAL_COLUMNS+NUMERICAL_COLUMNS,
                label_cols=LABEL_COLUMNS,
                output_cols=OUTPUT_COLUMNS
                ))])

pipeline.fit(train_df) ## fiting the dataset
result = pipeline.predict(test_df)

In [33]:
type(train_df), type(test_df), type(result)

(snowflake.snowpark.dataframe.DataFrame,
 snowflake.snowpark.dataframe.DataFrame,
 snowflake.snowpark.dataframe.DataFrame)

In [34]:
test = test_df.to_pandas().replace(np.nan, pd.isna)
train = train_df.to_pandas().replace(np.nan, pd.isna)
test["AUTO_YEAR"] = test["AUTO_YEAR"].astype(str)
train["AUTO_YEAR"] = train["AUTO_YEAR"].astype(str)

In [35]:
pred = pipeline.predict(test)

In [38]:
X_train = train.drop(["TOTAL_CLAIM_AMOUNT_PAID"], axis=1)
y_train = train["TOTAL_CLAIM_AMOUNT_PAID"]

X_test = pred.drop(["TOTAL_CLAIM_AMOUNT_PAID", "PREDICTION"], axis=1)
y_test = pred["TOTAL_CLAIM_AMOUNT_PAID"]

y_pred = pred["PREDICTION"]

In [43]:
type(X_train), type(X_test), type(y_train), type(y_test), type(y_pred), type(result)

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.series.Series,
 pandas.core.series.Series,
 pandas.core.series.Series,
 snowflake.snowpark.dataframe.DataFrame)

# Register Snowflake Model using FosforML

In [47]:
register_model(
  model_obj=pipeline,
  session=my_session,
  name="XGBoost_Claim_predictor",
  snowflake_df=result,
  target_column="PREDICTION",
  dataset_name="AUTO_INSURANCE_CLAIMS_DATA",
  dataset_source="Snowflake",
  source="Notebook",
  description="XGBoost model trained via Notebook to predict claim amount",
  flavour="snowflake",
  model_type="regression",
  conda_dependencies=["scikit-learn==1.3.2"]
)

Calculating build time metrics

Progress: ██████████████████                                                     25.0%
(0000) Length of multilabel label columns should be of the same between y_true_col_names and y_pred_col_names.Got y_true_col_names=['PREDICTION'] (length: 1) vs y_pred_col_names=['INCIDENT_TYPE', 'COLLISION_TYPE', 'INCIDENT_SEVERITY', 'PROPERTY_DAMAGE', 'AUTO_MAKE', 'AUTO_MODEL', 'AUTO_YEAR'] (length: 7).
(0000) Length of multilabel label columns should be of the same between y_true_col_names and y_pred_col_names.Got y_true_col_names=['PREDICTION'] (length: 1) vs y_pred_col_names=['INCIDENT_TYPE', 'COLLISION_TYPE', 'INCIDENT_SEVERITY', 'PROPERTY_DAMAGE', 'AUTO_MAKE', 'AUTO_MODEL', 'AUTO_YEAR'] (length: 7).
(0000) Length of multilabel label columns should be of the same between y_true_col_names and y_pred_col_names.Got y_true_col_names=['PREDICTION'] (length: 1) vs y_pred_col_names=['INCIDENT_TYPE', 'COLLISION_TYPE', 'INCIDENT_SEVERITY', 'PROPERTY_DAMAGE', 'AUTO_MAKE', 

"Model 'MODEL_0309078C_6F8E_4796_BF18_3789B23539DF_FDC_XGBOOST_CLAIM_PREDICTOR' registered successfully."