# Insurance Use Case: Code Template Notebook

## Packages needed to run this notebook when using the built-in Snowpark 3.8 template
#### !pip install --q "snowflake-connector-python[pandas]"
#### !sudo pip install --q snowflake-ml-python==1.0.11 -U
#### !pip install --upgrade --q snowflake-snowpark-python==1.9.0
#### !pip install --q seaborn
#### !pip install --upgrade --q xgboost==1.7.3
#### !pip install --upgrade --q numpy==1.24.3
#### !pip install --upgrade --q pandas==1.5.3
#### !pip install --upgrade --q anyio==3.5.0
#### !pip install --upgrade --q packaging==23.1
#### !pip install --upgrade --q scikit-learn==1.3.0
#### !pip install --upgrade --q typing-extensions==4.7.1
#### !pip install --upgrade --q cryptography==39.0.0
#### !pip install --upgrade --q fsspec==2023.9.2
#### !pip install --q xgboost
#### !pip install --q catboost
#### !pip install --q fosforio
#### !pip install --q fosforml

In [1]:
from snowflake.snowpark import Session

# Data Science Libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# create_temp_table warning suppression
import warnings; warnings.simplefilter('ignore')

#Modeling Libs
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# FosforIO to read from snowflake
from fosforio import snowflake
# FosforML to register Model on FDC
from joblib import dump, load
from fosforml import *
from fosforml.constants import MLModelFlavours
import requests

Matplotlib created a temporary cache directory at /tmp/matplotlib-8e7lzdes because the default path (/home/mosaic-ai/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


Connection manager service url initialised to http://fdc-project-manager:80/project-manager
If you need to update its value then update the variable CONNECTION_MANAGER_BASE_URL in os env.


# Read data using FosforIO

In [2]:
# To get snowflake connection object with a specific connection name
# NOTE(review): the returned connection is not assigned to a variable here, so the
# notebook keeps no handle with which to close it later.
snowflake.get_connection(connection_name="FDC_Insurance_Snowflake")

Connection object created: <snowflake.connector.connection.SnowflakeConnection object at 0x7fdc2465fac0>
Please close the connection after use!


<snowflake.connector.connection.SnowflakeConnection at 0x7fdc2465fac0>

In [3]:
# To read a specific dataset published from a snowflake connection
# Note: `df` is reassigned further down, when the same table is read through Snowpark.
df = snowflake.get_dataframe("AUTO_INSURANCE_CLAIMS_DATA_PRODUCT")

In [4]:
# Preview the first rows of the dataset read through FosforIO
df.head()

Unnamed: 0,MONTHS_AS_CUSTOMER,CUSTOMER_AGE,POLICY_NUMBER,POLICY_BIND_DATE,POLICY_STATE,POLICY_CSL,POLICY_DEDUCTABLE,POLICY_ANNUAL_PREMIUM,UMBRELLA_LIMIT,INSURED_ZIP,...,WITNESSES,POLICE_REPORT_AVAILABLE,TOTAL_CLAIM_AMOUNT_PAID,INJURY_CLAIM,PROPERTY_CLAIM,VEHICLE_CLAIM,AUTO_MAKE,AUTO_MODEL,AUTO_YEAR,FRAUD_REPORTED
0,178,69,431670,12-02-2012,CT,500/1000,1800,2195,6678588,617699,...,2,Police Report Available,102099.0,13040.0,14700.0,74359.0,Toyota,Highlander,1996,No Fraud Reported
1,235,48,452457,15-06-2004,CT,100/300,730,947,9201803,434206,...,3,Police Report Available,65165.0,21811.0,6992.0,36362.0,Volkswagen,Passat,2005,No Fraud Reported
2,292,46,466074,20-01-1996,CT,100/300,1249,1504,7764950,614166,...,3,Police Report Available,81875.0,5613.0,5109.0,71153.0,Ford,Fusion,2006,No Fraud Reported
3,67,41,485048,13-12-2012,CT,100/300,1584,1908,8502385,473243,...,3,Police Report Available,68429.0,308.0,821.0,67300.0,Chevrolet,Tahoe,1995,No Fraud Reported
4,211,58,451276,29-03-1996,CT,250/500,1951,2501,4083163,472895,...,0,Unknown,64077.0,517.0,1164.0,62396.0,Jeep,Grand Cherokee,1995,Fraud Reported


# Read data using Snowflake's Snowpark

In [5]:
# Import all Snowflake connection details from Template or Project variables.
# `os` is imported here because no cell in this notebook imports it explicitly;
# without this the cell only works if the name happens to leak in from a star import.
import os

# Each value is None when the corresponding variable is not set.
db_user = os.getenv('Snowflake_user')
db_password = os.getenv('Snowflake_password')
db_account = os.getenv('Snowflake_Account')
db_database = os.getenv('Snowflake_Database')
# NOTE(review): the role is read from the *user* variable ('Snowflake_user').
# This looks like a copy/paste slip; confirm whether a dedicated role variable exists.
db_role = os.getenv('Snowflake_user')
db_warehouse = os.getenv('Snowflake_Warehouse')
db_schema = os.getenv('Snowflake_Schema')

In [6]:
# Build a Snowpark session from the connection details read from the environment.
from snowflake.snowpark.session import Session

connection_params = dict(
    user=db_user,
    password=db_password,
    account=db_account,
    warehouse=db_warehouse,
    database=db_database,
    schema=db_schema,
    role=db_role,
)
session = Session.builder.configs(connection_params).create()

In [77]:
# Select the warehouse, database and schema, in that order, before reading the table.
for use_statement in (
    'use warehouse FOSFOR_SOLUTIONS_WH;',
    'use database FDC_Insurance;',
    'use schema FDC_Insurance.PUBLIC;',
):
    session.sql(use_statement).collect()

# `df` is the Snowpark table handle; `insurance_claim` is its pandas materialisation.
df = session.table('FDC_Insurance.PUBLIC.AUTO_INSURANCE_CLAIMS_DATA_PRODUCT')
insurance_claim = df.to_pandas()

In [78]:
# Lower-case all column names; the cells below refer to columns by their lower-case names.
insurance_claim.columns = insurance_claim.columns.str.lower()

In [79]:
# Inspect the column names after lower-casing
insurance_claim.columns

Index(['months_as_customer', 'customer_age', 'policy_number',
       'policy_bind_date', 'policy_state', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'capital_gains', 'capital_loss',
       'incident_date', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_location', 'incident_hour_of_the_day', 'incident_time_of_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount_paid',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_model', 'auto_year', 'fraud_reported'],
      dtype='object')

In [80]:
# Give the paid-amount column the shorter name used by the rest of the notebook.
insurance_claim = insurance_claim.rename(
    columns={'total_claim_amount_paid': 'total_claim_amount'}
)

In [81]:
# Treat the "?" placeholder as a missing value.
# `np.nan` is used rather than the `np.NaN` alias, which was removed in NumPy 2.0.
insurance_claim = insurance_claim.replace("?", np.nan)

In [82]:
# Work on a copy so that `insurance_claim` is not affected by the in-place column drops below.
df = insurance_claim.copy()

In [83]:
# Target-free copy of the data; a row of it is used later as a sample scoring payload.
actual_inference = df.drop(columns=['fraud_reported'])

In [84]:
# Preview the target-free frame
actual_inference.head()

Unnamed: 0,months_as_customer,customer_age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,bodily_injuries,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year
0,213,66,432849,30-10-2001,CT,100/300,1546,2208,4146654,604147,...,2,3,No Police Report Available,71085.0,17655.0,1045.0,52385.0,Toyota,Camry,1999
1,255,63,427352,17-04-2001,CT,100/300,1801,2501,3320595,613114,...,0,2,No Police Report Available,67762.0,474.0,4447.0,62841.0,Saab,92x,2004
2,32,22,442056,13-10-1998,CT,100/300,1903,2642,8838774,602258,...,1,1,Unknown,51272.0,6672.0,1121.0,43479.0,Chevrolet,Malibu,2011
3,11,36,465577,22-12-1992,CT,500/1000,1612,1941,9620213,432218,...,0,0,Police Report Available,92164.0,325.0,17178.0,74661.0,Audi,A5,2005
4,18,25,433126,05-03-2013,CT,250/500,2080,2418,9926398,605692,...,0,1,No Police Report Available,40113.0,400.0,925.0,38788.0,Saab,95,2000


In [85]:
# Columns that are not used for prediction (identifiers, dates, location and vehicle details).
to_drop = [
    'policy_number',
    'policy_bind_date',
    'policy_state',
    'insured_zip',
    'incident_location',
    'incident_date',
    'incident_state',
    'incident_city',
    'insured_hobbies',
    'auto_make',
    'auto_model',
    'auto_year',
]

df = df.drop(columns=to_drop)

In [86]:
# Split the frame into the feature matrix and the target series.
target_column = 'fraud_reported'

y = df[target_column]
X = df.drop(columns=[target_column])

In [87]:
# Categorical features are the ones stored with the `object` dtype.
cat_df = X.select_dtypes(include='object')

In [88]:
# List the categorical feature columns
cat_df.columns

Index(['customer_age', 'policy_csl', 'insured_sex', 'insured_education_level',
       'insured_occupation', 'insured_relationship', 'incident_type',
       'collision_type', 'incident_severity', 'authorities_contacted',
       'incident_time_of_day', 'property_damage', 'police_report_available'],
      dtype='object')

In [89]:
# Show the distinct values of every categorical column.
for col, values in cat_df.items():
    print(f"{col}: \n{values.unique()}\n")

customer_age: 
['66' '63' '22' '36' '25' '20' '64' '56' '44' '61' '34' '67' '24' '57'
 '59' '21' '39' '51' '46' '60' '52' '65' '33' '29' '62' '40' '49' '48'
 '32' '30' '50' '37' '42' '68' '31' '54' '58' '35' '53' '26' '23' '28'
 '55' '19' '38' '27' '45' '41' '47' '69' '43']

policy_csl: 
['100/300' '500/1000' '250/500']

insured_sex: 
['MALE' 'FEMALE']

insured_education_level: 
['JD' 'Associate' 'High School' 'College' 'MD' 'Masters' 'PhD']

insured_occupation: 
['craft-repair' 'tech-support' 'adm-clerical' 'priv-house-serv'
 'armed-forces' 'transport-moving' 'sales' 'protective-serv'
 'other-service' 'farming-fishing' 'handlers-cleaners' 'exec-managerial'
 'prof-specialty' 'machine-op-inspct']

insured_relationship: 
['not-in-family' 'husband' 'other-relative' 'own-child' 'unmarried' 'wife']

incident_type: 
['Single Vehicle Collision' 'Parked Car' 'Multi-vehicle Collision'
 'Vehicle Theft']

collision_type: 
['Details not Available' 'Side Collision' 'Rear Collision'
 'Front Collisio

In [90]:
# One-hot encode the categorical features; drop_first=True omits the first level of each column.
cat_df = pd.get_dummies(cat_df, drop_first = True)

In [91]:
# Select numeric columns by kind rather than by an explicit list of integer/float widths,
# so a column is not silently lost if it arrives with another width (e.g. int64).
# The target column is not picked up: it holds text labels.
num_df = df.select_dtypes(include = 'number')

In [92]:
# List the numeric feature columns
num_df.columns

Index(['months_as_customer', 'policy_deductable', 'policy_annual_premium',
       'umbrella_limit', 'capital_gains', 'capital_loss',
       'incident_hour_of_the_day', 'number_of_vehicles_involved',
       'bodily_injuries', 'witnesses', 'total_claim_amount', 'injury_claim',
       'property_claim', 'vehicle_claim'],
      dtype='object')

In [93]:
# combining the Numerical and Categorical dataframes to get the final dataset
# (column-wise concatenation: numeric columns first, then the one-hot columns)

X = pd.concat([num_df, cat_df], axis = 1)

In [94]:
# splitting data into training set and test set
# `train_test_split` is already imported in the first cell of the notebook.
# A fixed random_state makes the split, and every metric computed from it, reproducible
# across kernel restarts.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [95]:
# Numeric training columns that get standardised.
# NOTE(review): 'policy_annual_premium' and 'total_claim_amount' are numeric but are not
# in this list, so they are left unscaled; confirm this is intentional.
numeric_to_scale = [
    'months_as_customer',
    'policy_deductable',
    'umbrella_limit',
    'capital_gains',
    'capital_loss',
    'incident_hour_of_the_day',
    'number_of_vehicles_involved',
    'bodily_injuries',
    'witnesses',
    'injury_claim',
    'property_claim',
    'vehicle_claim',
]
num_df = X_train[numeric_to_scale]

In [96]:
# Same numeric columns as selected from the training split, taken from the test split.
num_df_test = X_test[[
    'months_as_customer',
    'policy_deductable',
    'umbrella_limit',
    'capital_gains',
    'capital_loss',
    'incident_hour_of_the_day',
    'number_of_vehicles_involved',
    'bodily_injuries',
    'witnesses',
    'injury_claim',
    'property_claim',
    'vehicle_claim',
]]

In [97]:
# Standardise the numeric training columns; the scaler is fitted on the training split only.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(num_df)
scaled_data = scaler.transform(num_df)

In [98]:
# Apply the scaler fitted on the training columns to the test columns (no re-fitting).
scaled_data_test = scaler.transform(num_df_test)

In [99]:
# Wrap the scaled array in a DataFrame that reuses the training column names and row index,
# so it can later be concatenated column-wise with the rest of X_train.
scaled_num_df = pd.DataFrame(data = scaled_data, columns = num_df.columns, index = X_train.index)
scaled_num_df.head()

Unnamed: 0,months_as_customer,policy_deductable,umbrella_limit,capital_gains,capital_loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,injury_claim,property_claim,vehicle_claim
185659,0.561313,1.248439,1.505983,1.039427,-0.989383,-2.058331,2.170981,1.22618,-1.344742,0.941607,1.104487,0.029543
165436,-0.800847,-0.350607,1.242188,-0.636122,1.056366,0.844126,-0.678445,0.003937,-1.344742,0.762049,1.992616,0.707144
99123,0.121907,-0.589717,0.35118,-1.484047,-1.696628,1.356324,-0.678445,-1.218305,-1.344742,-1.0138,0.782697,-0.922386
153977,-0.152722,0.13882,-0.861763,-0.767826,-0.42462,-0.863201,-0.678445,0.003937,0.447805,-0.742713,-0.954305,-0.989503
24298,-1.163357,0.088383,-0.155933,-1.549089,-0.858614,1.014859,-0.678445,-1.218305,0.447805,-1.000644,-0.510432,1.577234


In [100]:
# Wrap the scaled test array in a DataFrame that reuses the test column names and row index,
# so it can later be concatenated column-wise with the rest of X_test.
scaled_num_df_test = pd.DataFrame(data = scaled_data_test, columns = num_df_test.columns, index = X_test.index)
scaled_num_df_test.head()

Unnamed: 0,months_as_customer,policy_deductable,umbrella_limit,capital_gains,capital_loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,injury_claim,property_claim,vehicle_claim
147877,-0.064841,1.85929,0.726617,-1.666503,-0.150908,-0.521736,-0.678445,0.003937,0.447805,0.143042,1.088861,1.066117
128005,-0.844787,-1.579781,-1.752858,1.662494,1.170889,-0.18027,-0.678445,1.22618,-0.448468,0.301187,-0.97082,-1.360459
79206,-0.888728,-1.363087,0.149047,0.91143,-1.596966,-0.009538,-0.678445,1.22618,-0.448468,1.878864,0.559617,-0.119795
95780,0.70412,-1.151998,-0.570701,-1.405723,-0.141294,-1.033934,0.746268,0.003937,-1.344742,1.8902,-0.973361,-0.023168
114281,0.110922,-0.096553,0.055475,-0.132269,-0.295656,1.014859,2.170981,-1.218305,-1.344742,-1.015339,-0.949097,1.715163


In [101]:
# Remove the unscaled versions of the scaled columns from X_train (in place).
# Not idempotent: running this cell a second time raises because the columns are already gone.
X_train.drop(columns = scaled_num_df.columns, inplace = True)

In [102]:
# Put the scaled numeric columns first, followed by the remaining X_train columns.
X_train = pd.concat([scaled_num_df, X_train], axis = 1)

In [103]:
# Remove the unscaled versions of the scaled columns from X_test (in place).
# Not idempotent: running this cell a second time raises because the columns are already gone.
X_test.drop(columns = scaled_num_df_test.columns, inplace = True)

In [104]:
# Put the scaled numeric columns first, followed by the remaining X_test columns.
X_test = pd.concat([scaled_num_df_test, X_test], axis = 1)

# Decision Tree

In [105]:
# Fit a decision tree with default hyper-parameters on the training split.
# DecisionTreeClassifier is imported in the first cell of the notebook.
# NOTE(review): no random_state is set for this first version of the model.
#from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

In [106]:
# Predicted labels for the test split
y_pred = dtc.predict(X_test)

In [107]:
# Accuracy on both splits, then the confusion matrix and per-class report for the test split.
train_predictions = dtc.predict(X_train)
dtc_train_acc = accuracy_score(y_train, train_predictions)
dtc_test_acc = accuracy_score(y_test, y_pred)

for split_label, split_accuracy in [("Training", dtc_train_acc), ("Test", dtc_test_acc)]:
    print(f"{split_label} accuracy of Decision Tree is : {split_accuracy}")

for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)

Training accuracy of Decision Tree is : 1.0
Test accuracy of Decision Tree is : 0.9939385660867042
[[ 8309   152]
 [  203 49903]]
                   precision    recall  f1-score   support

   Fraud Reported       0.98      0.98      0.98      8461
No Fraud Reported       1.00      1.00      1.00     50106

         accuracy                           0.99     58567
        macro avg       0.99      0.99      0.99     58567
     weighted avg       0.99      0.99      0.99     58567



In [108]:
# Feature columns, in the order in which `score` hands them to the model:
# the twelve scaled numeric columns, the two unscaled numeric columns
# ('policy_annual_premium', 'total_claim_amount'), then the one-hot encoded columns.
# NOTE(review): this hand-maintained list presumably mirrors X_train.columns at fit time;
# confirm it is regenerated whenever the training data or the encoding changes.
actual_model_columns = ['months_as_customer',
 'policy_deductable',
 'umbrella_limit',
 'capital_gains',
 'capital_loss',
 'incident_hour_of_the_day',
 'number_of_vehicles_involved',
 'bodily_injuries',
 'witnesses',
 'injury_claim',
 'property_claim',
 'vehicle_claim',
 'policy_annual_premium',
 'total_claim_amount',
 'customer_age_20',
 'customer_age_21',
 'customer_age_22',
 'customer_age_23',
 'customer_age_24',
 'customer_age_25',
 'customer_age_26',
 'customer_age_27',
 'customer_age_28',
 'customer_age_29',
 'customer_age_30',
 'customer_age_31',
 'customer_age_32',
 'customer_age_33',
 'customer_age_34',
 'customer_age_35',
 'customer_age_36',
 'customer_age_37',
 'customer_age_38',
 'customer_age_39',
 'customer_age_40',
 'customer_age_41',
 'customer_age_42',
 'customer_age_43',
 'customer_age_44',
 'customer_age_45',
 'customer_age_46',
 'customer_age_47',
 'customer_age_48',
 'customer_age_49',
 'customer_age_50',
 'customer_age_51',
 'customer_age_52',
 'customer_age_53',
 'customer_age_54',
 'customer_age_55',
 'customer_age_56',
 'customer_age_57',
 'customer_age_58',
 'customer_age_59',
 'customer_age_60',
 'customer_age_61',
 'customer_age_62',
 'customer_age_63',
 'customer_age_64',
 'customer_age_65',
 'customer_age_66',
 'customer_age_67',
 'customer_age_68',
 'customer_age_69',
 'policy_csl_250/500',
 'policy_csl_500/1000',
 'insured_sex_MALE',
 'insured_education_level_College',
 'insured_education_level_High School',
 'insured_education_level_JD',
 'insured_education_level_MD',
 'insured_education_level_Masters',
 'insured_education_level_PhD',
 'insured_occupation_armed-forces',
 'insured_occupation_craft-repair',
 'insured_occupation_exec-managerial',
 'insured_occupation_farming-fishing',
 'insured_occupation_handlers-cleaners',
 'insured_occupation_machine-op-inspct',
 'insured_occupation_other-service',
 'insured_occupation_priv-house-serv',
 'insured_occupation_prof-specialty',
 'insured_occupation_protective-serv',
 'insured_occupation_sales',
 'insured_occupation_tech-support',
 'insured_occupation_transport-moving',
 'insured_relationship_not-in-family',
 'insured_relationship_other-relative',
 'insured_relationship_own-child',
 'insured_relationship_unmarried',
 'insured_relationship_wife',
 'incident_type_Parked Car',
 'incident_type_Single Vehicle Collision',
 'incident_type_Vehicle Theft',
 'collision_type_Front Collision',
 'collision_type_Rear Collision',
 'collision_type_Side Collision',
 'incident_severity_Minor Damage',
 'incident_severity_Total Loss',
 'authorities_contacted_Fire',
 'authorities_contacted_None',
 'authorities_contacted_Other',
 'authorities_contacted_Police',
 'incident_time_of_day_Early Morning Hours',
 'incident_time_of_day_Morning to Noon',
 'incident_time_of_day_Night Time',
 'property_damage_No Property Damage',
 'property_damage_Property Damage',
 'police_report_available_Police Report Available',
 'police_report_available_Unknown']

In [109]:
@scoring_func
def score(model, request):
    """Score one insurance-claim record sent as a JSON payload.

    The record is pre-processed like the training data in this notebook: the
    unused columns are dropped, the categorical columns are one-hot encoded, and
    the numeric columns the scaler was fitted on are standardised. The result is
    aligned to ``actual_model_columns`` before it is given to the model.

    Relies on the notebook-level names ``to_drop``, ``scaler`` and
    ``actual_model_columns``.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    request
        Object whose ``json`` attribute is a dict with a ``"payload"`` entry that
        maps column name -> scalar value for a single record.

    Returns
    -------
    dict
        ``{"prediction": ..., "probability": ...}`` holding the output of
        ``model.predict`` and column 1 of ``model.predict_proba``.

    Raises
    ------
    KeyError
        If the payload lacks one of the categorical or numeric feature columns.
    """
    categorical_columns = [
        'customer_age', 'policy_csl', 'insured_sex', 'insured_education_level',
        'insured_occupation', 'insured_relationship', 'incident_type',
        'collision_type', 'incident_severity', 'authorities_contacted',
        'incident_time_of_day', 'property_damage', 'police_report_available',
    ]
    numeric_columns = [
        'months_as_customer', 'policy_deductable', 'policy_annual_premium',
        'umbrella_limit', 'capital_gains', 'capital_loss',
        'incident_hour_of_the_day', 'number_of_vehicles_involved',
        'bodily_injuries', 'witnesses', 'total_claim_amount', 'injury_claim',
        'property_claim', 'vehicle_claim',
    ]
    # Subset of the numeric columns that the scaler was fitted on, in the same order.
    scaled_columns = [
        'months_as_customer', 'policy_deductable', 'umbrella_limit',
        'capital_gains', 'capital_loss', 'incident_hour_of_the_day',
        'number_of_vehicles_involved', 'bodily_injuries', 'witnesses',
        'injury_claim', 'property_claim', 'vehicle_claim',
    ]

    payload_dict = request.json["payload"]
    df = pd.DataFrame(payload_dict, index=[0])
    df = df.fillna(0)
    df.columns = df.columns.str.lower()
    # errors="ignore": a payload that omits some of the unused columns can still be scored.
    df = df.drop(columns=to_drop, errors="ignore")

    # One-hot encode WITHOUT drop_first. On a single-row frame every column has exactly
    # one level, so drop_first=True would discard every dummy and the model would always
    # see the baseline category. The baseline levels that training dropped are removed by
    # the reindex below, because they are not in actual_model_columns.
    # astype(str) makes sure numeric-looking categoricals (e.g. customer_age sent as an
    # int) are encoded too, under the same "<column>_<value>" names used in training.
    cat_df = pd.get_dummies(df[categorical_columns].astype(str))

    num_df = df[numeric_columns]
    X = pd.concat([num_df, cat_df], axis=1)

    scaled_num_df = pd.DataFrame(
        data=scaler.transform(X[scaled_columns]),
        columns=scaled_columns,
        index=X.index,
    )
    X = pd.concat([scaled_num_df, X.drop(columns=scaled_columns)], axis=1)

    # Align to the training layout: add absent one-hot columns as 0, drop unknown ones,
    # and enforce the column order.
    X = X.reindex(columns=actual_model_columns, fill_value=0)

    # Both calls use the aligned frame (predict_proba previously received the unaligned one).
    prediction = model.predict(X)
    probability = model.predict_proba(X)[:, 1]
    return {"prediction": prediction, "probability": probability}

In [110]:
# Upper-case the column names of the full frame and keep a target-free copy of it.
insurance_claim.columns = insurance_claim.columns.str.upper()
insurance_claims = insurance_claim.drop(columns=['FRAUD_REPORTED'])
# Sample payload: first row of the lower-case, target-free frame built earlier.
payload = actual_inference.iloc[0].to_dict()

In [111]:
# Inspect the sample payload
payload

{'months_as_customer': 213,
 'customer_age': '66',
 'policy_number': 432849,
 'policy_bind_date': '30-10-2001',
 'policy_state': 'CT',
 'policy_csl': '100/300',
 'policy_deductable': 1546,
 'policy_annual_premium': 2208,
 'umbrella_limit': 4146654,
 'insured_zip': '604147',
 'insured_sex': 'MALE',
 'insured_education_level': 'JD',
 'insured_occupation': 'craft-repair',
 'insured_hobbies': 'camping',
 'insured_relationship': 'not-in-family',
 'capital_gains': 67272,
 'capital_loss': -65017,
 'incident_date': datetime.date(2021, 11, 12),
 'incident_type': 'Single Vehicle Collision',
 'collision_type': 'Details not Available',
 'incident_severity': 'Major Damage',
 'authorities_contacted': 'Other',
 'incident_state': 'CT',
 'incident_city': 'Hartford',
 'incident_location': 'Flute',
 'incident_hour_of_the_day': 20,
 'incident_time_of_day': 'Night Time',
 'number_of_vehicles_involved': 1,
 'property_damage': 'No Property Damage',
 'bodily_injuries': 2,
 'witnesses': 3,
 'police_report_avai

In [112]:
# Try the scoring function locally: a bare requests.Request object is used as a
# stand-in whose `json` attribute carries the payload, which is all `score` reads from it.
req = requests.Request()
req.json = {"payload":payload}
y_req = req
y_out = score(dtc, y_req)
y_out

{'prediction': array(['No Fraud Reported'], dtype=object),
 'probability': array([1.])}

In [113]:
# Inspect the request body that was passed to `score`
req.json

{'payload': {'months_as_customer': 213,
  'customer_age': '66',
  'policy_number': 432849,
  'policy_bind_date': '30-10-2001',
  'policy_state': 'CT',
  'policy_csl': '100/300',
  'policy_deductable': 1546,
  'policy_annual_premium': 2208,
  'umbrella_limit': 4146654,
  'insured_zip': '604147',
  'insured_sex': 'MALE',
  'insured_education_level': 'JD',
  'insured_occupation': 'craft-repair',
  'insured_hobbies': 'camping',
  'insured_relationship': 'not-in-family',
  'capital_gains': 67272,
  'capital_loss': -65017,
  'incident_date': datetime.date(2021, 11, 12),
  'incident_type': 'Single Vehicle Collision',
  'collision_type': 'Details not Available',
  'incident_severity': 'Major Damage',
  'authorities_contacted': 'Other',
  'incident_state': 'CT',
  'incident_city': 'Hartford',
  'incident_location': 'Flute',
  'incident_hour_of_the_day': 20,
  'incident_time_of_day': 'Night Time',
  'number_of_vehicles_involved': 1,
  'property_damage': 'No Property Damage',
  'bodily_injuries':

In [115]:
# Test-set predictions and the probability taken from column 1 of predict_proba.
# NOTE(review): column 1 appears to correspond to 'No Fraud Reported' (a probability of 1.0
# accompanies that label in the outputs shown in this notebook); confirm that this is the
# class whose probability is intended.
y_pred = dtc.predict(X_test)
y_prob = dtc.predict_proba(X_test)[:,1]

In [116]:
# Inspect the predicted labels
y_pred

array(['Fraud Reported', 'No Fraud Reported', 'No Fraud Reported', ...,
       'No Fraud Reported', 'No Fraud Reported', 'Fraud Reported'],
      dtype=object)

In [117]:
# Inspect the predicted probabilities
y_prob

array([0., 1., 1., ..., 1., 1., 0.])

In [118]:
## registering the model in Fosfor Insight Designer.
# The fitted tree is registered together with the `score` function and the test-set
# predictions / probabilities computed in the cells above.
# NOTE(review): `prob` receives column 1 of predict_proba; confirm which class
# `register_model` expects this probability to refer to.
model_reg = register_model(dtc, 
               score, 
               name="Claims_Fraud_DTree_Classifier", 
               description="Insurance claim's model trained for Fraud Classification",
               flavour=MLModelFlavours.sklearn,
               model_type="classification",
               init_script="snowflake-connector-python[pandas]",
               y_true=y_test,
               y_pred=y_pred, 
               prob=y_prob,
               features=X_train.columns,
               input_type="json", 
               explain_ai=True,
               x_train=X_train, 
               x_test=X_test, 
               y_train=y_train.tolist(),
               y_test=y_test.tolist(),
               feature_names=X_train.columns.tolist(),
               original_features=X_train.columns.tolist(),
               feature_ids=X_train.columns,
               kyd=True, kyd_score = True)

Calculating build time metrics

Progress: ██████████████████████████████████████████████████████████████████████ 100.0%


VBox(children=(HTML(value='<style>.grad_1{background: #2468a4;} .grad_2{ color:white; background: #2468a4;}</s…

# Version 2 of Decision Tree Fraud Classifier Model

In [125]:
# Second version of the model: same estimator, now with a fixed random_state.
# Rebinds `dtc`, replacing the first model object.
dtc = DecisionTreeClassifier(random_state=7)
dtc.fit(X_train, y_train)

In [126]:
# Test-set predictions and column 1 of predict_proba for the refitted model.
y_pred = dtc.predict(X_test)
y_prob = dtc.predict_proba(X_test)[:,1]

In [127]:
# Accuracy on both splits, then the confusion matrix and per-class report for the test split.
train_predictions = dtc.predict(X_train)
dtc_train_acc = accuracy_score(y_train, train_predictions)
dtc_test_acc = accuracy_score(y_test, y_pred)

for split_label, split_accuracy in [("Training", dtc_train_acc), ("Test", dtc_test_acc)]:
    print(f"{split_label} accuracy of Decision Tree is : {split_accuracy}")

for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)

Training accuracy of Decision Tree is : 1.0
Test accuracy of Decision Tree is : 0.9941263851657076
[[ 8305   156]
 [  188 49918]]
                   precision    recall  f1-score   support

   Fraud Reported       0.98      0.98      0.98      8461
No Fraud Reported       1.00      1.00      1.00     50106

         accuracy                           0.99     58567
        macro avg       0.99      0.99      0.99     58567
     weighted avg       0.99      0.99      0.99     58567



In [128]:
## registering the model in Fosfor Insight Designer using same name.
# Same call as for the first model, now with the refitted tree and its predictions.
# NOTE(review): presumably registering under an existing name creates a new version
# of that model; confirm against the fosforml documentation.
model_reg = register_model(dtc, 
               score, 
               name="Claims_Fraud_DTree_Classifier", 
               description="Insurance claim's model trained for Fraud Classification",
               flavour=MLModelFlavours.sklearn,
               model_type="classification",
               init_script="snowflake-connector-python[pandas]",
               y_true=y_test,
               y_pred=y_pred, 
               prob=y_prob,
               features=X_train.columns,
               input_type="json", 
               explain_ai=True,
               x_train=X_train, 
               x_test=X_test, 
               y_train=y_train.tolist(),
               y_test=y_test.tolist(),
               feature_names=X_train.columns.tolist(),
               original_features=X_train.columns.tolist(),
               feature_ids=X_train.columns,
               kyd=True, kyd_score = True)

Calculating build time metrics

Progress: ██████████████████████████████████████████████████████████████████████ 100.0%


VBox(children=(HTML(value='<style>.grad_1{background: #2468a4;} .grad_2{ color:white; background: #2468a4;}</s…