# Use Insurance Code Template Notebook Template

## Packages needed to run this notebook if running with inbuilt snowpark 3.8 template
#### !pip install --q "snowflake-connector-python[pandas]"
#### !sudo pip install --q  snowflake-ml-python==1.0.11 -U
#### !pip install --upgrade --q snowflake-snowpark-python==1.9.0
#### !pip install --q seaborn
#### !pip install --upgrade --q xgboost==1.7.3
#### !pip install --upgrade --q numpy==1.24.3
#### !pip install --upgrade --q pandas==1.5.3
#### !pip install --upgrade --q anyio==3.5.0
#### !pip install --upgrade --q packaging==23.1
#### !pip install --upgrade --q scikit-learn==1.3.0
#### !pip install --upgrade --q typing-extensions==4.7.1
#### !pip install --upgrade --q cryptography==39.0.0
#### !pip install --upgrade --q fsspec==2023.9.2
#### !pip install --q xgboost
#### !pip install --q catboost
#### !pip install --q fosforio
#### !pip install --q fosforml

In [2]:
from snowflake.snowpark import Session

# Data Science Libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# create_temp_table warning suppresion
import warnings; warnings.simplefilter('ignore')

# Model libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# FosforIO to read from snowflake
from fosforio import snowflake
# FosforML to register Model on FDC
from joblib import dump, load
from fosforml import *
from fosforml.constants import MLModelFlavours
import requests

Connection manager service url initialised to http://fdc-project-manager:80/project-manager
If you need to update its value then update the variable CONNECTION_MANAGER_BASE_URL in os env.


# Read data using FosforIO

In [3]:
# To get snowflake connection object with a specific connection name
snowflake.get_connection(connection_name="FDC_Insurance_Snowflake")

Connection object created: <snowflake.connector.connection.SnowflakeConnection object at 0x7f4d21556460>
Please close the connection after use!


<snowflake.connector.connection.SnowflakeConnection at 0x7f4d21556460>

In [4]:
# To read a specific dataset published from a snowflake connection
df = snowflake.get_dataframe("AUTO_INSURANCE_CLAIMS_DATA_PRODUCT")

In [5]:
df.head()

Unnamed: 0,MONTHS_AS_CUSTOMER,CUSTOMER_AGE,POLICY_NUMBER,POLICY_BIND_DATE,POLICY_STATE,POLICY_CSL,POLICY_DEDUCTABLE,POLICY_ANNUAL_PREMIUM,UMBRELLA_LIMIT,INSURED_ZIP,...,WITNESSES,POLICE_REPORT_AVAILABLE,TOTAL_CLAIM_AMOUNT_PAID,INJURY_CLAIM,PROPERTY_CLAIM,VEHICLE_CLAIM,AUTO_MAKE,AUTO_MODEL,AUTO_YEAR,FRAUD_REPORTED
0,178,69,431670,12-02-2012,CT,500/1000,1800,2195,6678588,617699,...,2,Police Report Available,102099.0,13040.0,14700.0,74359.0,Toyota,Highlander,1996,No Fraud Reported
1,235,48,452457,15-06-2004,CT,100/300,730,947,9201803,434206,...,3,Police Report Available,65165.0,21811.0,6992.0,36362.0,Volkswagen,Passat,2005,No Fraud Reported
2,292,46,466074,20-01-1996,CT,100/300,1249,1504,7764950,614166,...,3,Police Report Available,81875.0,5613.0,5109.0,71153.0,Ford,Fusion,2006,No Fraud Reported
3,67,41,485048,13-12-2012,CT,100/300,1584,1908,8502385,473243,...,3,Police Report Available,68429.0,308.0,821.0,67300.0,Chevrolet,Tahoe,1995,No Fraud Reported
4,211,58,451276,29-03-1996,CT,250/500,1951,2501,4083163,472895,...,0,Unknown,64077.0,517.0,1164.0,62396.0,Jeep,Grand Cherokee,1995,Fraud Reported


# Read data using Snowflake's Snowpark

In [6]:
#Import all snowflake connection details from Template or Project variables.
db_user = os.getenv('Snowflake_user')
db_password =  os.getenv('Snowflake_password')
db_account = os.getenv('Snowflake_Account')
db_database =  os.getenv('Snowflake_Database')
db_role = os.getenv('Snowflake_user')
db_warehouse = os.getenv('Snowflake_Warehouse')
db_schema = os.getenv('Snowflake_Schema')

In [7]:
from snowflake.snowpark.session import Session
connection_params = {
    'user': db_user,
    'password': db_password,
    'account': db_account,
    'warehouse': db_warehouse,
    'database': db_database,
    'schema': db_schema,
    'role': db_role
}
session = Session.builder.configs(connection_params).create()

In [9]:
session.sql('use warehouse FOSFOR_SOLUTIONS_WH;').collect()
session.sql('use database FDC_Insurance;').collect()
session.sql('use schema FDC_Insurance.PUBLIC;').collect()

df = session.table('FDC_Insurance.PUBLIC.AUTO_INSURANCE_CLAIMS_DATA_PRODUCT')

In [10]:
insurance_claim = df.to_pandas()

In [11]:
insurance_claim.columns = insurance_claim.columns.str.lower()

In [12]:
insurance_claim.columns

Index(['months_as_customer', 'customer_age', 'policy_number',
       'policy_bind_date', 'policy_state', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'capital_gains', 'capital_loss',
       'incident_date', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_location', 'incident_hour_of_the_day', 'incident_time_of_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount_paid',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_model', 'auto_year', 'fraud_reported'],
      dtype='object')

In [13]:
insurance_claim.rename(columns = {'total_claim_amount_paid': 'total_claim_amount'}, inplace=True)

In [14]:
insurance_claim=insurance_claim.replace("?",np.NaN)

In [15]:
df = insurance_claim.copy()

In [16]:
# dropping columns which are not necessary for prediction

to_drop = ['policy_number','policy_bind_date','policy_state','insured_zip','incident_location','incident_date',
           'incident_state','incident_city','insured_hobbies','auto_make','auto_model','auto_year']

df.drop(to_drop, inplace=True, axis=1)

In [17]:
# separating the feature and target columns

X = df.drop('fraud_reported', axis = 1)
y = df['fraud_reported']

In [18]:
# extracting categorical columns
cat_df = X.select_dtypes(include = ['object'])

In [19]:
# printing unique values of each column
for col in cat_df.columns:
    print(f"{col}: \n{cat_df[col].unique()}\n")

customer_age: 
['66' '63' '22' '36' '25' '20' '64' '56' '44' '61' '34' '67' '24' '57'
 '59' '21' '39' '51' '46' '60' '52' '65' '33' '29' '62' '40' '49' '48'
 '32' '30' '50' '37' '42' '68' '31' '54' '58' '35' '53' '26' '23' '28'
 '55' '19' '38' '27' '45' '41' '47' '69' '43']

policy_csl: 
['100/300' '500/1000' '250/500']

insured_sex: 
['MALE' 'FEMALE']

insured_education_level: 
['JD' 'Associate' 'High School' 'College' 'MD' 'Masters' 'PhD']

insured_occupation: 
['craft-repair' 'tech-support' 'adm-clerical' 'priv-house-serv'
 'armed-forces' 'transport-moving' 'sales' 'protective-serv'
 'other-service' 'farming-fishing' 'handlers-cleaners' 'exec-managerial'
 'prof-specialty' 'machine-op-inspct']

insured_relationship: 
['not-in-family' 'husband' 'other-relative' 'own-child' 'unmarried' 'wife']

incident_type: 
['Single Vehicle Collision' 'Parked Car' 'Multi-vehicle Collision'
 'Vehicle Theft']

collision_type: 
['Details not Available' 'Side Collision' 'Rear Collision'
 'Front Collisio

In [20]:
cat_df = pd.get_dummies(cat_df, drop_first = True)

In [21]:
num_df = df.select_dtypes(include = ['int16','int8','int32','float64'])

In [22]:
# combining the Numerical and Categorical dataframes to get the final dataset

X = pd.concat([num_df, cat_df], axis = 1)

In [23]:
# splitting data into training set and test set

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [24]:
num_df = X_train[['months_as_customer', 'policy_deductable', 'umbrella_limit',
       'capital_gains', 'capital_loss', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'bodily_injuries', 'witnesses', 'injury_claim', 'property_claim',
       'vehicle_claim']]

In [25]:
num_df_test = X_test[['months_as_customer', 'policy_deductable', 'umbrella_limit',
       'capital_gains', 'capital_loss', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'bodily_injuries', 'witnesses', 'injury_claim', 'property_claim',
       'vehicle_claim']]

In [26]:
# Scaling the numeric values in the dataset

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_data = scaler.fit_transform(num_df)

In [27]:
scaled_data_test = scaler.transform(num_df_test)

In [28]:
scaled_num_df = pd.DataFrame(data = scaled_data, columns = num_df.columns, index = X_train.index)
scaled_num_df.head()

Unnamed: 0,months_as_customer,policy_deductable,umbrella_limit,capital_gains,capital_loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,injury_claim,property_claim,vehicle_claim
41591,0.398301,1.903664,1.278529,-1.176387,1.248564,-0.51997,-0.679344,-1.217903,0.447189,-1.030458,0.69998,-1.731233
68393,-0.173392,1.613905,0.087725,-1.180668,-0.335679,0.162623,-0.679344,0.004718,1.343688,0.897328,0.002062,-0.084441
49392,0.82707,-1.76225,-1.754803,-1.27619,0.298598,0.674567,-0.679344,0.004718,-0.449311,0.093105,-0.948522,-0.219096
166020,-0.151403,-1.156561,0.269574,-1.504292,1.031202,-0.008026,2.16848,0.004718,-1.345811,-0.688169,1.257627,0.455203
151378,-1.03093,-1.154692,-0.906854,-1.377397,-1.355369,-1.031915,-0.679344,0.004718,0.447189,1.476952,0.95341,-0.283173


In [29]:
scaled_num_df_test = pd.DataFrame(data = scaled_data_test, columns = num_df_test.columns, index = X_test.index)
scaled_num_df_test.head()

Unnamed: 0,months_as_customer,policy_deductable,umbrella_limit,capital_gains,capital_loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,injury_claim,property_claim,vehicle_claim
221054,0.684147,1.552215,0.491134,-0.485786,1.54468,1.35716,0.744568,-1.217903,-0.449311,-1.033816,0.255874,-0.424714
174016,0.849058,-0.8425,-0.231257,1.592636,-1.560382,1.186512,0.744568,0.004718,-0.449311,-0.294944,-0.966851,-0.777739
22705,2.058408,1.056821,0.632628,0.230107,-1.5595,0.333271,0.744568,-1.217903,0.447189,-1.023601,2.09327,1.361324
157018,0.134443,-1.072438,-0.416811,-0.760731,0.89041,-0.178674,-0.679344,-1.217903,0.447189,-1.023181,-0.960487,1.27047
159165,-0.063451,-1.188341,1.021124,0.925349,1.271623,0.674567,-0.679344,0.004718,1.343688,1.757807,-0.984799,0.076483


In [30]:
X_train.drop(columns = scaled_num_df.columns, inplace = True)

In [31]:
X_train = pd.concat([scaled_num_df, X_train], axis = 1)

In [32]:
X_test.drop(columns = scaled_num_df_test.columns, inplace = True)

In [33]:
X_test = pd.concat([scaled_num_df_test, X_test], axis = 1)

# Decision Tree

In [34]:
#from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

In [35]:
y_pred = dtc.predict(X_test)

In [36]:
dtc_train_acc = accuracy_score(y_train, dtc.predict(X_train))
dtc_test_acc = accuracy_score(y_test, y_pred)

print(f"Training accuracy of Decision Tree is : {dtc_train_acc}")
print(f"Test accuracy of Decision Tree is : {dtc_test_acc}")

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Training accuracy of Decision Tree is : 1.0
Test accuracy of Decision Tree is : 0.9953386719483668
[[ 8452   121]
 [  152 49842]]
                   precision    recall  f1-score   support

   Fraud Reported       0.98      0.99      0.98      8573
No Fraud Reported       1.00      1.00      1.00     49994

         accuracy                           1.00     58567
        macro avg       0.99      0.99      0.99     58567
     weighted avg       1.00      1.00      1.00     58567



# Random Forest Classifier

In [None]:
rand_clf = RandomForestClassifier(criterion= 'entropy', max_depth= 10, max_features= 'sqrt', min_samples_leaf= 1, min_samples_split= 3, n_estimators= 140)
rand_clf.fit(X_train, y_train)

In [40]:
y_pred = rand_clf.predict(X_test)

In [41]:
rand_clf_train_acc = accuracy_score(y_train, rand_clf.predict(X_train))
rand_clf_test_acc = accuracy_score(y_test, y_pred)

print(f"Training accuracy of Random Forest is : {rand_clf_train_acc}")
print(f"Test accuracy of Random Forest is : {rand_clf_test_acc}")

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Training accuracy of Random Forest is : 0.8523944495668704
Test accuracy of Random Forest is : 0.8518961189748493
[[    0  8674]
 [    0 49893]]
                   precision    recall  f1-score   support

   Fraud Reported       0.00      0.00      0.00      8674
No Fraud Reported       0.85      1.00      0.92     49893

         accuracy                           0.85     58567
        macro avg       0.43      0.50      0.46     58567
     weighted avg       0.73      0.85      0.78     58567



# Adaboost Classifier

In [None]:
ada = AdaBoostClassifier(base_estimator = dtc)

parameters = {
    'n_estimators' : [50, 70, 90, 120, 180, 200],
    'learning_rate' : [0.001, 0.01, 0.1, 1, 10],
    'algorithm' : ['SAMME', 'SAMME.R']
}

grid_search = GridSearchCV(ada, parameters, n_jobs = -1, cv = 5, verbose = 1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


In [None]:
# best parameter and best score

print(grid_search.best_params_)
print(grid_search.best_score_)

In [None]:
# best estimator 

ada = grid_search.best_estimator_

y_pred = ada.predict(X_test)

In [None]:
ada_train_acc = accuracy_score(y_train, ada.predict(X_train))
ada_test_acc = accuracy_score(y_test, y_pred)

print(f"Training accuracy of Ada Boost is : {ada_train_acc}")
print(f"Test accuracy of Ada Boost is : {ada_test_acc}")

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Gradient Boosting

In [None]:
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

In [None]:
gb_acc = accuracy_score(y_test, gb.predict(X_test))

In [None]:
print(f"Training Accuracy of Gradient Boosting Classifier is {accuracy_score(y_train, gb.predict(X_train))}")
print(f"Test Accuracy of Gradient Boosting Classifier is {gb_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, gb.predict(X_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, gb.predict(X_test))}")

# Stochastic Gradient Boosting

In [None]:
sgb = GradientBoostingClassifier(subsample = 0.90, max_features = 0.70)
sgb.fit(X_train, y_train)

In [None]:
sgb_acc = accuracy_score(y_test, sgb.predict(X_test))

In [None]:
print(f"Training Accuracy of Stochastic Gradient Boosting is {accuracy_score(y_train, sgb.predict(X_train))}")
print(f"Test Accuracy of Stochastic Gradient Boosting is {sgb_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, sgb.predict(X_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, sgb.predict(X_test))}")

# XGBoost Classifier

In [None]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

In [None]:
y_pred = xgb.predict(X_test)

In [None]:
xgb_train_acc = accuracy_score(y_train, xgb.predict(X_train))
xgb_test_acc = accuracy_score(y_test, y_pred)

print(f"Training accuracy of XgBoost is : {xgb_train_acc}")
print(f"Test accuracy of XgBoost is : {xgb_test_acc}")

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
param_grid = {"n_estimators": [10, 50, 100, 130], "criterion": ['gini', 'entropy'],
                               "max_depth": range(2, 10, 1)}

grid = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5,  verbose=3,n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
xgb = grid_search.best_estimator_

y_pred = xgb.predict(X_test)

In [None]:
xgb_train_acc = accuracy_score(y_train, xgb.predict(X_train))
xgb_test_acc = accuracy_score(y_test, y_pred)

print(f"Training accuracy of XgBoost is : {xgb_train_acc}")
print(f"Test accuracy of XgBoost is : {xgb_test_acc}")

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Catboost Classifier

In [None]:
cat = CatBoostClassifier(iterations=10)
cat.fit(X_train, y_train)

In [None]:
cat_acc = accuracy_score(y_test, cat.predict(X_test))

In [None]:
print(f"Training Accuracy of Cat Boost Classifier is {accuracy_score(y_train, cat.predict(X_train))}")
print(f"Test Accuracy of Cat Boost Classifier is {cat_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, cat.predict(X_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, cat.predict(X_test))}")

# Extra Tree Classifier

In [None]:
etc = ExtraTreesClassifier()
etc.fit(X_train, y_train)

In [None]:
etc_acc = accuracy_score(y_test, etc.predict(X_test))

In [None]:
print(f"Training Accuracy of Extra Trees Classifier is {accuracy_score(y_train, etc.predict(X_train))}")
print(f"Test Accuracy of Extra Trees Classifier is {etc_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, etc.predict(X_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, etc.predict(X_test))}")

# Voting Classifier

In [None]:
# Voting Classifier
classifiers = [ ('Decision Tree', dtc), ('Random Forest', rand_clf),
               ('Ada Boost', ada), ('XGboost', xgb), ('Gradient Boosting Classifier', gb), ('SGB', sgb),
               ('Cat Boost', cat), ('Extra Trees Classifier', etc)]

vc = VotingClassifier(estimators = classifiers)
vc.fit(X_train, y_train)

In [None]:
y_pred = vc.predict(X_test)

In [None]:
vc_train_acc = accuracy_score(y_train, vc.predict(X_train))
vc_test_acc = accuracy_score(y_test, y_pred)

print(f"Training accuracy of Voting Classifier is : {vc_train_acc}")
print(f"Test accuracy of Voting Classifier is : {vc_test_acc}")

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Model evaluation

In [None]:
models = pd.DataFrame({
    'Model' : ['Decision Tree', 'Random Forest','Ada Boost', 'Gradient Boost', 'SGB', 'Cat Boost', 'Extra Trees','XgBoost', 'Voting Classifier'],
    'Score' : [dtc_test_acc, rand_clf_test_acc, ada_test_acc, gb_acc, sgb_acc, cat_acc, etc_acc, xgb_test_acc, vc_test_acc]
})


models.sort_values(by = 'Score', ascending = False)

In [None]:
px.bar(data_frame = models, x = 'Score', y = 'Model', color = 'Score', template = 'plotly_dark', 
       title = 'Models Comparison')