# Use Claims_Modeling Notebook template

## Import Snowpark Pandas for dumping csv to Snowflake table

In [19]:
#Run only if not using Claims_Modeling Template for execution
'''
! pip install "snowflake-connector-python[pandas]"
! sudo pip install snowflake-ml-python==1.0.11 -U
! pip install --upgrade snowflake-snowpark-python==1.9.0
! pip install --upgrade xgboost==1.7.3
! pip install --upgrade numpy==1.24.3
! pip install --upgrade pandas==1.5.3
! pip install --upgrade anyio==3.5.0
! pip install --upgrade packaging==23.1
! pip install --upgrade scikit-learn==1.3.0
! pip install --upgrade typing-extensions==4.7.1
! pip install --upgrade cryptography==39.0.0
! pip install --upgrade fsspec==2023.9.2
!pip install refractml
'''

'\n! pip install "snowflake-connector-python[pandas]"\n! sudo pip install snowflake-ml-python==1.0.11 -U\n! pip install --upgrade snowflake-snowpark-python==1.9.0\n! pip install --upgrade xgboost==1.7.3\n! pip install --upgrade numpy==1.24.3\n! pip install --upgrade pandas==1.5.3\n! pip install --upgrade anyio==3.5.0\n! pip install --upgrade packaging==23.1\n! pip install --upgrade scikit-learn==1.3.0\n! pip install --upgrade typing-extensions==4.7.1\n! pip install --upgrade cryptography==39.0.0\n! pip install --upgrade fsspec==2023.9.2\n!pip install refractml\n'

In [1]:
from snowflake.snowpark import Session
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.modeling.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
# import seaborn as sns
from sklearn.metrics import mean_absolute_percentage_error
# Pandas Tools
from snowflake.connector.pandas_tools import write_pandas
# Data Science Libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# create_temp_table warning suppresion
import warnings; warnings.simplefilter('ignore')
import configparser

## Code to establish connection and read and dump csv as a snowflake table (handling null values before loading)

In [2]:
config = configparser.ConfigParser()
config.read("snowflake_connection.ini")

['snowflake_connection.ini']

In [3]:
connection_parameters = {
    "user": f'{config["Snowflake"]["user"]}',
    #"password": f'{config["Snowflake"]["password"]}',
    "password": os.getenv('snowflake_password'),
    #"account": f'{config["Snowflake"]["account"]}',
    "account": os.getenv('snowflake_account'),
    "WAREHOUSE": f'{config["Snowflake"]["WAREHOUSE"]}',
    "DATABASE": f'{config["Snowflake"]["DATABASE"]}',
    "SCHEMA": f'{config["Snowflake"]["SCHEMA"]}'
}

In [4]:
def snowflake_connector(conn):
    try:
        session = Session.builder.configs(conn).create()
        print("connection successful!")
    except:
        raise ValueError("error while connecting with db")
    return session

session = snowflake_connector(connection_parameters)

connection successful!


In [5]:
df = session.table("AUTO_INSURANCE_CLAIMS_DATA_PRODUCT")

In [9]:
data = df.to_pandas()

# Shape of Insurance claims data

In [10]:
data.shape

(234265, 40)

# Generic Information of Insurance claims data

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234265 entries, 0 to 234264
Data columns (total 40 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   MONTHS_AS_CUSTOMER           234265 non-null  int16  
 1   CUSTOMER_AGE                 234265 non-null  object 
 2   POLICY_NUMBER                234265 non-null  int32  
 3   POLICY_BIND_DATE             234265 non-null  object 
 4   POLICY_STATE                 234265 non-null  object 
 5   POLICY_CSL                   234265 non-null  object 
 6   POLICY_DEDUCTABLE            234265 non-null  int16  
 7   POLICY_ANNUAL_PREMIUM        234265 non-null  int16  
 8   UMBRELLA_LIMIT               234265 non-null  int32  
 9   INSURED_ZIP                  234265 non-null  object 
 10  INSURED_SEX                  234265 non-null  object 
 11  INSURED_EDUCATION_LEVEL      234265 non-null  object 
 12  INSURED_OCCUPATION           234265 non-null  object 
 13 

# Description of Insurance claims data

In [14]:
data.describe(include='O')

Unnamed: 0,CUSTOMER_AGE,POLICY_BIND_DATE,POLICY_STATE,POLICY_CSL,INSURED_ZIP,INSURED_SEX,INSURED_EDUCATION_LEVEL,INSURED_OCCUPATION,INSURED_HOBBIES,INSURED_RELATIONSHIP,...,AUTHORITIES_CONTACTED,INCIDENT_STATE,INCIDENT_CITY,INCIDENT_LOCATION,INCIDENT_TIME_OF_DAY,PROPERTY_DAMAGE,POLICE_REPORT_AVAILABLE,AUTO_MAKE,AUTO_MODEL,FRAUD_REPORTED
count,234265,234265,234265,234265,234265,234265,234265,234265,234265,234265,...,234265,234265,234265,234265,234265,234265,234265,234265,234265,234265
unique,51,10044,1,3,995,2,7,14,20,6,...,5,1,10,1032,4,3,3,14,39,2
top,33,20-02-2010,CT,100/300,431532,MALE,JD,prof-specialty,kayaking,unmarried,...,Police,CT,Hartford,St,Night Time,Property Damage,Police Report Available,Nissan,RAM,No Fraud Reported
freq,4910,67,234265,78564,337,117455,38426,27637,12104,39357,...,47433,234265,55768,13547,104477,132025,101965,20119,7953,199657


In [10]:
df = df.na.drop()
df = df.drop(['POLICY_NUMBER','MONTHS_AS_CUSTOMER','CUSTOMER_AGE','POLICY_BIND_DATE','POLICY_STATE','POLICY_CSL','UMBRELLA_LIMIT',
 'INSURED_ZIP','INSURED_SEX','INSURED_EDUCATION_LEVEL','INSURED_OCCUPATION','INSURED_HOBBIES','INSURED_RELATIONSHIP',
 'capital-gains','capital-loss', 'INCIDENT_DATE', 'AUTHORITIES_CONTACTED', 'INCIDENT_STATE', 'INCIDENT_CITY',
 'INCIDENT_LOCATION', 'INCIDENT_HOUR_OF_THE_DAY', 'INCIDENT_TIME_OF_DAY', 'WITNESSES', 'POLICE_REPORT_AVAILABLE',
 'INJURY_CLAIM', 'PROPERTY_CLAIM', 'VEHICLE_CLAIM', 'FRAUD_REPORTED'])

In [11]:
df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"POLICY_DEDUCTABLE"  |"POLICY_ANNUAL_PREMIUM"  |"CAPITAL_GAINS"  |"CAPITAL_LOSS"  |"INCIDENT_TYPE"           |"COLLISION_TYPE"  |"INCIDENT_SEVERITY"  |"NUMBER_OF_VEHICLES_INVOLVED"  |"PROPERTY_DAMAGE"      |"BODILY_INJURIES"  |"TOTAL_CLAIM_AMOUNT_PAID"  |"AUTO_MAKE"  |"AUTO_MODEL"  |"AUTO_YEAR"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|1509                 |2067                     |69030            |-20687          |Single Vehicle C

In [15]:
train_df, test_df = df.random_split([0.8,0.2], seed=69) #seed=60 for Model version 2
CATEGORICAL_COLUMNS = ["INCIDENT_TYPE", "COLLISION_TYPE","INCIDENT_SEVERITY", "PROPERTY_DAMAGE",
                       "AUTO_MAKE","AUTO_MODEL","AUTO_YEAR",]
NUMERICAL_COLUMNS = ["POLICY_DEDUCTABLE", "POLICY_ANNUAL_PREMIUM","NUMBER_OF_VEHICLES_INVOLVED", "BODILY_INJURIES"]
LABEL_COLUMNS = ["TOTAL_CLAIM_AMOUNT_PAID"]
OUTPUT_COLUMNS = ["PREDICTION"]

In [16]:
## train_df and test_df are both snowpark dataframes
pipeline = Pipeline(
    steps=[(
                "OE",
                OrdinalEncoder(
                input_cols=CATEGORICAL_COLUMNS,
                output_cols=CATEGORICAL_COLUMNS,
                handle_unknown='use_encoded_value',
                unknown_value=-1
                )),
               ("MMS",
                MinMaxScaler(
                clip=True,
                input_cols=NUMERICAL_COLUMNS,
                output_cols=NUMERICAL_COLUMNS,
                )),
               ("classification",
                XGBRegressor(
                input_cols=CATEGORICAL_COLUMNS+NUMERICAL_COLUMNS,
                label_cols=LABEL_COLUMNS,
                output_cols=OUTPUT_COLUMNS
                ))])

pipeline.fit(train_df) ## fiting the dataset
result = pipeline.predict(test_df)

In [17]:
from joblib import dump, load
filename = "ILF_XGB_Model.joblib" #Model version 1
#filename = "ILF_XGB_Model_v2.joblib" #Model version 2
dump(pipeline, filename)

['ILF_XGB_Model.joblib']

In [18]:
model = load(filename)

In [14]:
from refractml import *
from refractml.constants import MLModelFlavours
import requests
import numpy as np

In [15]:
test = test_df.to_pandas().replace(np.nan, pd.isna)
train = train_df.to_pandas().replace(np.nan, pd.isna)
test["AUTO_YEAR"] = test["AUTO_YEAR"].astype(str)
train["AUTO_YEAR"] = train["AUTO_YEAR"].astype(str)

In [16]:
@scoring_func
def score(model, request):
    payload_dict = request.json["payload"]
    data = pd.DataFrame(payload_dict,index=[0])
    data = data.replace(np.nan, pd.isna)
    data["AUTO_YEAR"] = data["AUTO_YEAR"].astype(str)
    prediction = str(model.predict(data)["PREDICTION"][0])
    return prediction

In [17]:
pred = model.predict(test)
test["PREDICTION"] = pred["PREDICTION"]

In [18]:
X_train = train.drop(["TOTAL_CLAIM_AMOUNT_PAID"], axis=1)
y_train = train["TOTAL_CLAIM_AMOUNT_PAID"]

X_test = pred.drop(["TOTAL_CLAIM_AMOUNT_PAID", "PREDICTION"], axis=1)
y_test = pred["TOTAL_CLAIM_AMOUNT_PAID"]

y_pred = pred["PREDICTION"]

In [19]:
payload  = test.iloc[0].to_dict()
del payload["TOTAL_CLAIM_AMOUNT_PAID"]
del payload["PREDICTION"]
payload

{'POLICY_DEDUCTABLE': 1545,
 'POLICY_ANNUAL_PREMIUM': 1861,
 'CAPITAL_GAINS': 48627,
 'CAPITAL_LOSS': -13390,
 'INCIDENT_TYPE': 'Multi-vehicle Collision',
 'COLLISION_TYPE': 'Front Collision',
 'INCIDENT_SEVERITY': 'Minor Damage',
 'NUMBER_OF_VEHICLES_INVOLVED': 2,
 'PROPERTY_DAMAGE': 'No Property Damage',
 'BODILY_INJURIES': 2,
 'AUTO_MAKE': 'Ford',
 'AUTO_MODEL': 'Escape',
 'AUTO_YEAR': '1997'}

In [20]:
req = requests.Request()
req.json = {"payload":payload}
y_req = req
y_out = score(model, y_req)
y_out

'45137.13'

In [21]:
## SAMPLE PAYLOAD
data = {
  "payload": {
    "POLICY_DEDUCTABLE": 500,
    "POLICY_ANNUAL_PREMIUM": 938,
    "INCIDENT_TYPE": "Vehicle Theft",
    "COLLISION_TYPE": "Details not Available",
    "INCIDENT_SEVERITY": "Total Loss",
    "NUMBER_OF_VEHICLES_INVOLVED": 1,
    "PROPERTY_DAMAGE": "Property Damage",
    "BODILY_INJURIES": 0,
    "AUTO_MAKE": "Honda",
    "AUTO_MODEL": "Civic",
    "AUTO_YEAR": "2002"
  }
}

In [22]:
## registering the model in refract.
model_reg = register_model(model, 
               score, 
               name="ILF_XGB_Model", 
               description="Insurance claim's model trained using SnowflakeML XGB",
               flavour=MLModelFlavours.sklearn,
               model_type="regression",
               init_script="pip install snowflake-ml-python==1.0.11",
               y_true=y_test,
               y_pred=y_pred, 
               features=X_train.columns,
               input_type="json", 
               explain_ai=True,
               x_train=X_train, 
               x_test=X_test, 
               y_train=y_train.tolist(),
               y_test=y_test.tolist(),
               feature_names=X_train.columns.tolist(),
               original_features=X_train.columns.tolist(),
               feature_ids=X_train.columns,
               kyd=True, kyd_score = True)

Calculating build time metrics

Progress: ██████████████████████████████████████████████████████████████████████ 100.0%


VBox(children=(HTML(value='<style>.grad_1{background: #2468a4;} .grad_2{ color:white; background: #2468a4;}</s…

# Code to call ILF XGB Model internally using Insight Designer Notebook

In [39]:
def model_prediction(data):
    model = load('ILF_XGB_Model.joblib', mmap_mode='r')
    data = data.replace(np.nan, pd.isna)
    data["AUTO_YEAR"] = data["AUTO_YEAR"].astype(str)
    return model.predict(data)['PREDICTION'] #Only returns Prediction
    #return model.predict(data) #Returns Complete input data along with Prediction

In [40]:
test['PREDICTION'] = model_prediction(test) #If only Prediction is expected from the score function output
#test = model_prediction(test) #If complete input data along with Prediction is expected from the score function output

In [43]:
test.head()

Unnamed: 0,POLICY_DEDUCTABLE,POLICY_ANNUAL_PREMIUM,CAPITAL_GAINS,CAPITAL_LOSS,INCIDENT_TYPE,COLLISION_TYPE,INCIDENT_SEVERITY,NUMBER_OF_VEHICLES_INVOLVED,PROPERTY_DAMAGE,BODILY_INJURIES,TOTAL_CLAIM_AMOUNT_PAID,AUTO_MAKE,AUTO_MODEL,AUTO_YEAR,PREDICTION
0,1545,1861,48627,-13390,Multi-vehicle Collision,Front Collision,Minor Damage,2,No Property Damage,2,38325.0,Ford,Escape,1997,45004.199219
1,1422,1871,3050,-1639,Single Vehicle Collision,Details not Available,Major Damage,1,Property Damage,2,71290.0,Volkswagen,Jetta,2007,73125.117188
2,1113,1391,32285,-45249,Parked Car,Side Collision,Minor Damage,2,No Property Damage,2,53007.0,Suburu,Forrestor,2007,45793.1875
3,406,548,79600,-53718,Multi-vehicle Collision,Rear Collision,Minor Damage,3,No Property Damage,2,52112.0,Suburu,Impreza,2015,43414.09375
4,1886,2119,16195,-5303,Multi-vehicle Collision,Front Collision,Total Loss,2,Property Damage,0,91874.0,Accura,RSX,2003,91049.640625


In [44]:
#Drift files to be setup using snowflake
drift_old = test.head(10000)
drift_old.to_csv("/data/ILF_Claims_Output_v1.csv", index=False)

drift_new = test.tail(10000)
drift_new.to_csv("/data/ILF_Claims_Output_v2.csv", index=False)

# Code to call ILF XGB Model in external/3rd party application

In [None]:
access_token = "eyJhbGciOiJSUzI1NiIsInR5cCIgOiAiSldUIiwia2lkIiA6ICJ4WTdTd3k5UE1xaXRDQmNSMm5qcVl6bmoxS3NqZzV3TmdOV0xDVzdyUkhvIn0.eyJleHAiOjE3MzA4ODM3MzgsImlhdCI6MTY5OTI2MTMzOCwiYXV0aF90aW1lIjoxNjk5MjUyMDQ0LCJqdGkiOiJmN2EzMzQwYy1kNDQwLTRlMzUtYjk2ZS04YzBiMTc0Y2RhODAiLCJpc3MiOiJodHRwczovL3JlZnJhY3QtbG9naW4uZm9zZm9yLmNvbS9hdXRoL3JlYWxtcy9tb3NhaWMiLCJhdWQiOlsibW9zYWljLWdhdGVrZWVwZXIiLCJhY2NvdW50Il0sInN1YiI6IjZjMjU4MWU3LWZmMTItNDljNy04MDJmLWI2ZjQzOWQxZDIwMSIsInR5cCI6IkJlYXJlciIsImF6cCI6Im1vc2FpYy1nYXRla2VlcGVyIiwic2Vzc2lvbl9zdGF0ZSI6IjBhY2Y3YWZhLTVmMzMtNGRhZS05OGM3LTQyZDQwYTdlZTM2NiIsImFsbG93ZWQtb3JpZ2lucyI6WyIqIl0sInJlYWxtX2FjY2VzcyI6eyJyb2xlcyI6WyJNTE9QUyIsImxvbmdfbGl2ZWRfdG9rZW4iLCJzcGVjdHJhLWRldmVsb3BlciIsImRlZmF1bHQtcm9sZXMtbW9zYWljIiwicmVmcmFjdC1kZXZlbG9wZXIiLCJvZmZsaW5lX2FjY2VzcyIsImFkbWluIiwidW1hX2F1dGhvcml6YXRpb24iLCJyZWZyYWN0LWFkbWluIl19LCJyZXNvdXJjZV9hY2Nlc3MiOnsiYWNjb3VudCI6eyJyb2xlcyI6WyJtYW5hZ2UtYWNjb3VudCIsIm1hbmFnZS1hY2NvdW50LWxpbmtzIiwidmlldy1wcm9maWxlIl19fSwic2NvcGUiOiJvcGVuaWQgZW1haWwgcHJvZmlsZSIsInNpZCI6IjBhY2Y3YWZhLTVmMzMtNGRhZS05OGM3LTQyZDQwYTdlZTM2NiIsImVtYWlsX3ZlcmlmaWVkIjp0cnVlLCJuYW1lIjoiUmVmcmFjdCBCRlNJIiwicHJlZmVycmVkX3VzZXJuYW1lIjoicmVmcmFjdC5iZnNpQGZvc2Zvci5jb20iLCJnaXZlbl9uYW1lIjoiUmVmcmFjdCIsImZhbWlseV9uYW1lIjoiQkZTSSIsImVtYWlsIjoicmVmcmFjdC5iZnNpQGZvc2Zvci5jb20ifQ.b6SYLgjo9Veo3GmJ8eZjCTNupQjpfMhzsoXdYjWwRtvRnNjBfx0gOqcugO9OcGn-mm8wwpSGI5uiL30-I6SdWBjsf1ur6GztoX7j-nP_3SrJJn3UhNNqIO8LbsPi5gGRTzWtnfjz92BF1YaCXxQwPY0P_aa8vJ6JxZz5Uctn9aIPIJZZnnjC_GPXtXurmshM_tEN2kwCjhEyr7wYzRqUoMtBGfpLjZREBzgZY-x6JyYiXNtycb1d6PFcCXf7nJVV8ienEC_x7OuciDzfeqd-SQnImvAHH7rqFdi9smBN08AbkDS2uAbMrokHrmbiBpaimrR013VwCWz2KL5QYlWleA"

def model(payload, access_token):
    headers = {'Content-Type': 'application/json',
               'Authorization': 'Bearer ' + access_token}
    url = "https://qa.fdc.leni.ai/ilfxgbmodel/4f549c4b-0bca-491a-8c45-5fe870838fcc/score"
    # payload = str(payload.to_dict())
    data = {"payload": payload}
    print("***********************************")
    print(data)
    print("***************************")
    response = requests.post(url, json=data, headers=headers)
    return response #pd.DataFrame(eval(response.json()["data"]))


In [None]:
# data_1 = pd.DataFrame(payload, index=[0])
output1 = model(payload, access_token)

In [None]:
output1.json()["data"]