# Use Customized Notebook Template (CLAIMS_RESERVE_Template)

In [1]:
from snowflake.snowpark import Session

# Data Science Libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# create_temp_table warning suppresion
import warnings; warnings.simplefilter('ignore')

# Model libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from fosforml import *
from fosforml.constants import MLModelFlavours
import requests

Matplotlib created a temporary cache directory at /tmp/matplotlib-dq9rinnd because the default path (/home/mosaic-ai/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


Connection manager service url initialised to http://fdc-project-manager:80/project-manager
If you need to update its value then update the variable CONNECTION_MANAGER_BASE_URL in os env.


# Read data using FosforIO

In [2]:
# To get snowflake connection object with a specific connection name
snowflake.get_connection(connection_name="FDC_Insurance_Snowflake")

Connection object created: <snowflake.connector.connection.SnowflakeConnection object at 0x7fc0b2953cd0>
Please close the connection after use!


<snowflake.connector.connection.SnowflakeConnection at 0x7fc0b2953cd0>

In [3]:
# To read a specific dataset published from a snowflake connection
df = snowflake.get_dataframe("AUTO_INSURANCE_CLAIMS_DATA")
#df = snowflake.get_dataframe("AUTO_INSURANCE_CLAIMS_DATA_PRODUCT")

In [4]:
df.head()

Unnamed: 0,MONTHS_AS_CUSTOMER,CUSTOMER_AGE,POLICY_NUMBER,POLICY_BIND_DATE,POLICY_STATE,POLICY_CSL,POLICY_DEDUCTABLE,POLICY_ANNUAL_PREMIUM,UMBRELLA_LIMIT,INSURED_ZIP,...,WITNESSES,POLICE_REPORT_AVAILABLE,TOTAL_CLAIM_AMOUNT_PAID,INJURY_CLAIM,PROPERTY_CLAIM,VEHICLE_CLAIM,AUTO_MAKE,AUTO_MODEL,AUTO_YEAR,FRAUD_REPORTED
0,178,69,431670,12-02-2012,CT,500/1000,1800,2195,6678588,617699,...,2,Police Report Available,102099.0,13040.0,14700.0,74359.0,Toyota,Highlander,1996,No Fraud Reported
1,235,48,452457,15-06-2004,CT,100/300,730,947,9201803,434206,...,3,Police Report Available,65165.0,21811.0,6992.0,36362.0,Volkswagen,Passat,2005,No Fraud Reported
2,292,46,466074,20-01-1996,CT,100/300,1249,1504,7764950,614166,...,3,Police Report Available,81875.0,5613.0,5109.0,71153.0,Ford,Fusion,2006,No Fraud Reported
3,67,41,485048,13-12-2012,CT,100/300,1584,1908,8502385,473243,...,3,Police Report Available,68429.0,308.0,821.0,67300.0,Chevrolet,Tahoe,1995,No Fraud Reported
4,211,58,451276,29-03-1996,CT,250/500,1951,2501,4083163,472895,...,0,Unknown,64077.0,517.0,1164.0,62396.0,Jeep,Grand Cherokee,1995,Fraud Reported


# Read data using Snowflake's Snowpark

In [5]:
#Import all snowflake connection details from Template or Project variables.
db_user = os.getenv('Snowflake_user')
db_password =  os.getenv('Snowflake_password')
db_account = os.getenv('Snowflake_Account')
db_database =  os.getenv('Snowflake_Database')
db_role = os.getenv('Snowflake_user')
db_warehouse = os.getenv('Snowflake_Warehouse')
db_schema = os.getenv('Snowflake_Schema')

In [6]:
from snowflake.snowpark.session import Session
connection_params = {
    'user': db_user,
    'password': db_password,
    'account': db_account,
    'warehouse': db_warehouse,
    'database': db_database,
    'schema': db_schema,
    'role': db_role
}
session = Session.builder.configs(connection_params).create()

In [7]:
session.sql('use warehouse FOSFOR_SOLUTIONS_WH;').collect()
session.sql('use database FDC_Insurance;').collect()
session.sql('use schema FDC_Insurance.PUBLIC;').collect()

df = session.table('FDC_Insurance.PUBLIC.AUTO_INSURANCE_CLAIMS_DATA')
#df = session.table('FDC_Insurance.PUBLIC.AUTO_INSURANCE_CLAIMS_DATA_PRODUCT')

In [8]:
insurance_claim = df.to_pandas()

In [9]:
insurance_claim.columns = insurance_claim.columns.str.lower()

In [10]:
insurance_claim.columns

Index(['months_as_customer', 'customer_age', 'policy_number',
       'policy_bind_date', 'policy_state', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'capital_gains', 'capital_loss',
       'incident_date', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_location', 'incident_hour_of_the_day', 'incident_time_of_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount_paid',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_model', 'auto_year', 'fraud_reported'],
      dtype='object')

In [11]:
insurance_claim.rename(columns = {'total_claim_amount_paid': 'total_claim_amount'}, inplace=True)

In [12]:
insurance_claim=insurance_claim.replace("?",np.NaN)

In [13]:
df = insurance_claim.copy()

In [14]:
# dropping columns which are not necessary for prediction

to_drop = ['policy_number','policy_bind_date','policy_state','insured_zip','incident_location','incident_date',
           'incident_state','incident_city','insured_hobbies','auto_make','auto_model','auto_year']

df.drop(to_drop, inplace=True, axis=1)

In [15]:
# separating the feature and target columns

X = df.drop('fraud_reported', axis = 1)
y = df['fraud_reported']

In [16]:
# extracting categorical columns
cat_df = X.select_dtypes(include = ['object'])

In [17]:
# printing unique values of each column
for col in cat_df.columns:
    print(f"{col}: \n{cat_df[col].unique()}\n")

customer_age: 
['66' '63' '22' '36' '25' '20' '64' '56' '44' '61' '34' '67' '24' '57'
 '59' '21' '39' '51' '46' '60' '52' '65' '33' '29' '62' '40' '49' '48'
 '32' '30' '50' '37' '42' '68' '31' '54' '58' '35' '53' '26' '23' '28'
 '55' '19' '38' '27' '45' '41' '47' '69' '43']

policy_csl: 
['100/300' '500/1000' '250/500']

insured_sex: 
['MALE' 'FEMALE']

insured_education_level: 
['JD' 'Associate' 'High School' 'College' 'MD' 'Masters' 'PhD']

insured_occupation: 
['craft-repair' 'tech-support' 'adm-clerical' 'priv-house-serv'
 'armed-forces' 'transport-moving' 'sales' 'protective-serv'
 'other-service' 'farming-fishing' 'handlers-cleaners' 'exec-managerial'
 'prof-specialty' 'machine-op-inspct']

insured_relationship: 
['not-in-family' 'husband' 'other-relative' 'own-child' 'unmarried' 'wife']

incident_type: 
['Single Vehicle Collision' 'Parked Car' 'Multi-vehicle Collision'
 'Vehicle Theft']

collision_type: 
['Details not Available' 'Side Collision' 'Rear Collision'
 'Front Collisio

In [18]:
cat_df = pd.get_dummies(cat_df, drop_first = True)

In [19]:
num_df = df.select_dtypes(include = ['int16','int8','int32','float64'])

In [20]:
# combining the Numerical and Categorical dataframes to get the final dataset

X = pd.concat([num_df, cat_df], axis = 1)

In [21]:
# splitting data into training set and test set

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [22]:
num_df = X_train[['months_as_customer', 'policy_deductable', 'umbrella_limit',
       'capital_gains', 'capital_loss', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'bodily_injuries', 'witnesses', 'injury_claim', 'property_claim',
       'vehicle_claim']]

In [23]:
num_df_test = X_test[['months_as_customer', 'policy_deductable', 'umbrella_limit',
       'capital_gains', 'capital_loss', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'bodily_injuries', 'witnesses', 'injury_claim', 'property_claim',
       'vehicle_claim']]

In [24]:
# Scaling the numeric values in the dataset

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_data = scaler.fit_transform(num_df)

In [25]:
scaled_data_test = scaler.transform(num_df_test)

In [26]:
scaled_num_df = pd.DataFrame(data = scaled_data, columns = num_df.columns, index = X_train.index)
scaled_num_df.head()

Unnamed: 0,months_as_customer,policy_deductable,umbrella_limit,capital_gains,capital_loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,injury_claim,property_claim,vehicle_claim
175292,2.032034,0.850215,-0.462042,0.036684,-0.569366,0.674654,-0.676935,1.228025,-0.448884,1.243609,-0.977675,0.057678
210243,-0.284956,-1.675015,-1.755664,-1.254706,-0.751185,1.016133,0.750681,1.228025,1.344426,-0.729768,-0.976533,0.510525
175223,-0.471633,-1.452585,0.18343,1.206992,0.438035,0.845394,-0.676935,0.0054,-0.448884,1.904058,-0.976787,0.392693
76900,2.010072,-0.331092,0.270283,0.930427,1.584427,0.162434,-0.676935,-1.217225,-0.448884,-1.025677,1.05236,-0.223805
116188,0.165265,0.203487,0.499922,-0.245674,-0.03656,-0.179046,-0.676935,-1.217225,-0.448884,-1.011,0.592188,-0.500955


In [27]:
scaled_num_df_test = pd.DataFrame(data = scaled_data_test, columns = num_df_test.columns, index = X_test.index)
scaled_num_df_test.head()

Unnamed: 0,months_as_customer,policy_deductable,umbrella_limit,capital_gains,capital_loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,injury_claim,property_claim,vehicle_claim
115009,0.165265,-1.602118,-1.755664,0.093393,0.35591,-0.862005,-0.676935,1.228025,1.344426,1.92782,-0.962954,0.141735
96096,0.637448,1.227784,0.870981,0.245889,0.544538,-1.032745,-0.676935,0.0054,-1.345539,1.668393,0.125036,0.478725
151860,-0.43869,-1.048847,1.061565,-1.337862,-1.582525,0.162434,-0.676935,0.0054,0.447771,0.67206,-0.830715,0.585022
121778,0.725296,0.244608,1.292735,1.098503,-0.269529,0.333174,0.750681,1.228025,-1.345539,-0.00027,-0.970314,-0.883065
178823,1.362193,0.010964,-0.762669,0.365568,0.60506,0.674654,0.750681,1.228025,-1.345539,-0.280104,0.950579,0.412576


In [28]:
X_train.drop(columns = scaled_num_df.columns, inplace = True)

In [29]:
X_train = pd.concat([scaled_num_df, X_train], axis = 1)

In [30]:
X_test.drop(columns = scaled_num_df_test.columns, inplace = True)

In [31]:
X_test = pd.concat([scaled_num_df_test, X_test], axis = 1)

# Decision Tree

In [32]:
#from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

In [33]:
y_pred = dtc.predict(X_test)

In [34]:
dtc_train_acc = accuracy_score(y_train, dtc.predict(X_train))
dtc_test_acc = accuracy_score(y_test, y_pred)

print(f"Training accuracy of Decision Tree is : {dtc_train_acc}")
print(f"Test accuracy of Decision Tree is : {dtc_test_acc}")

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Training accuracy of Decision Tree is : 1.0
Test accuracy of Decision Tree is : 0.9940239383953421
[[ 8553   145]
 [  205 49664]]
                   precision    recall  f1-score   support

   Fraud Reported       0.98      0.98      0.98      8698
No Fraud Reported       1.00      1.00      1.00     49869

         accuracy                           0.99     58567
        macro avg       0.99      0.99      0.99     58567
     weighted avg       0.99      0.99      0.99     58567



# Random Forest Classifier

In [35]:
rand_clf = RandomForestClassifier(criterion= 'entropy', max_depth= 6, max_features= 'sqrt', 
                                  min_samples_leaf= 1, min_samples_split= 3, n_estimators= 50)

In [36]:
rand_clf.fit(X_train, y_train)

In [37]:
y_pred = rand_clf.predict(X_test)

In [38]:
rand_clf_train_acc = accuracy_score(y_train, rand_clf.predict(X_train))
rand_clf_test_acc = accuracy_score(y_test, y_pred)

print(f"Training accuracy of Random Forest is : {rand_clf_train_acc}")
print(f"Test accuracy of Random Forest is : {rand_clf_test_acc}")

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Training accuracy of Random Forest is : 0.852531047593029
Test accuracy of Random Forest is : 0.8514863318933871
[[    0  8698]
 [    0 49869]]
                   precision    recall  f1-score   support

   Fraud Reported       0.00      0.00      0.00      8698
No Fraud Reported       0.85      1.00      0.92     49869

         accuracy                           0.85     58567
        macro avg       0.43      0.50      0.46     58567
     weighted avg       0.73      0.85      0.78     58567



# Gradient Boosting

In [39]:
gb = GradientBoostingClassifier()

In [40]:
gb.fit(X_train, y_train)

In [41]:
gb_acc = accuracy_score(y_test, gb.predict(X_test))

In [42]:
print(f"Training Accuracy of Gradient Boosting Classifier is {accuracy_score(y_train, gb.predict(X_train))}")
print(f"Test Accuracy of Gradient Boosting Classifier is {gb_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, gb.predict(X_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, gb.predict(X_test))}")

Training Accuracy of Gradient Boosting Classifier is 0.8527359446322668
Test Accuracy of Gradient Boosting Classifier is 0.8516058531254802 

Confusion Matrix :- 
[[    7  8691]
 [    0 49869]]

Classification Report :- 
                    precision    recall  f1-score   support

   Fraud Reported       1.00      0.00      0.00      8698
No Fraud Reported       0.85      1.00      0.92     49869

         accuracy                           0.85     58567
        macro avg       0.93      0.50      0.46     58567
     weighted avg       0.87      0.85      0.78     58567



# Stochastic Gradient Boosting

In [43]:
sgb = GradientBoostingClassifier(subsample = 0.90, max_features = 0.70)
sgb.fit(X_train, y_train)

In [44]:
sgb_acc = accuracy_score(y_test, sgb.predict(X_test))

In [45]:
print(f"Training Accuracy of Stochastic Gradient Boosting is {accuracy_score(y_train, sgb.predict(X_train))}")
print(f"Test Accuracy of Stochastic Gradient Boosting is {sgb_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, sgb.predict(X_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, sgb.predict(X_test))}")

Training Accuracy of Stochastic Gradient Boosting is 0.8526733372036107
Test Accuracy of Stochastic Gradient Boosting is 0.8515375552785699 

Confusion Matrix :- 
[[    3  8695]
 [    0 49869]]

Classification Report :- 
                    precision    recall  f1-score   support

   Fraud Reported       1.00      0.00      0.00      8698
No Fraud Reported       0.85      1.00      0.92     49869

         accuracy                           0.85     58567
        macro avg       0.93      0.50      0.46     58567
     weighted avg       0.87      0.85      0.78     58567



# XGBoost Classifier

In [48]:
type(y_train)

pandas.core.series.Series

In [49]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
xgb_y_train = le.fit_transform(y_train)
xgb_y_test = le.transform(y_test)

In [50]:
xgb = XGBClassifier()
xgb.fit(X_train, xgb_y_train)

In [51]:
y_pred = xgb.predict(X_test)

In [52]:
xgb_train_acc = accuracy_score(xgb_y_train, xgb.predict(X_train))
xgb_test_acc = accuracy_score(xgb_y_test, y_pred)

print(f"Training accuracy of XgBoost is : {xgb_train_acc}")
print(f"Test accuracy of XgBoost is : {xgb_test_acc}")

print(confusion_matrix(xgb_y_test, y_pred))
print(classification_report(xgb_y_test, y_pred))

Training accuracy of XgBoost is : 0.8670445878723719
Test accuracy of XgBoost is : 0.8603309030682808
[[  535  8163]
 [   17 49852]]
              precision    recall  f1-score   support

           0       0.97      0.06      0.12      8698
           1       0.86      1.00      0.92     49869

    accuracy                           0.86     58567
   macro avg       0.91      0.53      0.52     58567
weighted avg       0.88      0.86      0.80     58567



# Catboost Classifier

In [53]:
cat = CatBoostClassifier(iterations=10)
cat.fit(X_train, y_train)

Learning rate set to 0.5
0:	learn: 0.4768173	total: 66.9ms	remaining: 602ms
1:	learn: 0.4257421	total: 87.1ms	remaining: 348ms
2:	learn: 0.4158646	total: 105ms	remaining: 244ms
3:	learn: 0.4129188	total: 123ms	remaining: 184ms
4:	learn: 0.4111623	total: 140ms	remaining: 140ms
5:	learn: 0.4102624	total: 158ms	remaining: 105ms
6:	learn: 0.4089223	total: 177ms	remaining: 75.9ms
7:	learn: 0.4082601	total: 197ms	remaining: 49.4ms
8:	learn: 0.4075961	total: 215ms	remaining: 23.8ms
9:	learn: 0.4069333	total: 244ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7fbfaff488b0>

In [54]:
cat_acc = accuracy_score(y_test, cat.predict(X_test))

In [55]:
print(f"Training Accuracy of Cat Boost Classifier is {accuracy_score(y_train, cat.predict(X_train))}")
print(f"Test Accuracy of Cat Boost Classifier is {cat_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, cat.predict(X_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, cat.predict(X_test))}")

Training Accuracy of Cat Boost Classifier is 0.8526391876970711
Test Accuracy of Cat Boost Classifier is 0.8514863318933871 

Confusion Matrix :- 
[[    3  8695]
 [    3 49866]]

Classification Report :- 
                    precision    recall  f1-score   support

   Fraud Reported       0.50      0.00      0.00      8698
No Fraud Reported       0.85      1.00      0.92     49869

         accuracy                           0.85     58567
        macro avg       0.68      0.50      0.46     58567
     weighted avg       0.80      0.85      0.78     58567



# Extra Tree Classifier

In [56]:
etc = ExtraTreesClassifier()
etc.fit(X_train, y_train)

In [57]:
etc_acc = accuracy_score(y_test, etc.predict(X_test))

In [58]:
print(f"Training Accuracy of Extra Trees Classifier is {accuracy_score(y_train, etc.predict(X_train))}")
print(f"Test Accuracy of Extra Trees Classifier is {etc_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, etc.predict(X_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, etc.predict(X_test))}")

Training Accuracy of Extra Trees Classifier is 1.0
Test Accuracy of Extra Trees Classifier is 0.9969436713507607 

Confusion Matrix :- 
[[ 8519   179]
 [    0 49869]]

Classification Report :- 
                    precision    recall  f1-score   support

   Fraud Reported       1.00      0.98      0.99      8698
No Fraud Reported       1.00      1.00      1.00     49869

         accuracy                           1.00     58567
        macro avg       1.00      0.99      0.99     58567
     weighted avg       1.00      1.00      1.00     58567



# Voting Classifier

In [59]:
# Voting Classifier
classifiers = [ ('Decision Tree', dtc), ('Random Forest', rand_clf),
                ('Gradient Boosting Classifier', gb), ('SGB', sgb),
               ('Cat Boost', cat), ('Extra Trees Classifier', etc)]

vc = VotingClassifier(estimators = classifiers)
vc.fit(X_train, y_train)

Learning rate set to 0.5
0:	learn: 0.4768173	total: 18.1ms	remaining: 163ms
1:	learn: 0.4257421	total: 37.6ms	remaining: 150ms
2:	learn: 0.4158646	total: 55.4ms	remaining: 129ms
3:	learn: 0.4129188	total: 73.5ms	remaining: 110ms
4:	learn: 0.4111623	total: 91.4ms	remaining: 91.4ms
5:	learn: 0.4102624	total: 109ms	remaining: 73ms
6:	learn: 0.4089223	total: 128ms	remaining: 54.7ms
7:	learn: 0.4082601	total: 151ms	remaining: 37.8ms
8:	learn: 0.4075961	total: 174ms	remaining: 19.3ms
9:	learn: 0.4069333	total: 195ms	remaining: 0us


In [60]:
y_pred = vc.predict(X_test)

In [61]:
vc_train_acc = accuracy_score(y_train, vc.predict(X_train))
vc_test_acc = accuracy_score(y_test, y_pred)

print(f"Training accuracy of Voting Classifier is : {vc_train_acc}")
print(f"Test accuracy of Voting Classifier is : {vc_test_acc}")

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Training accuracy of Voting Classifier is : 0.8528725426584253
Test accuracy of Voting Classifier is : 0.851657076510663
[[   10  8688]
 [    0 49869]]
                   precision    recall  f1-score   support

   Fraud Reported       1.00      0.00      0.00      8698
No Fraud Reported       0.85      1.00      0.92     49869

         accuracy                           0.85     58567
        macro avg       0.93      0.50      0.46     58567
     weighted avg       0.87      0.85      0.78     58567



# Model evaluation

In [62]:
models = pd.DataFrame({
    'Model' : ['Decision Tree', 'Random Forest' ,'Gradient Boost', 'SGB', 'Cat Boost', 'Extra Trees', 'Voting Classifier'],
    'Score' : [dtc_test_acc, rand_clf_test_acc, gb_acc, sgb_acc, cat_acc, etc_acc,  vc_test_acc]
})


models.sort_values(by = 'Score', ascending = False)

Unnamed: 0,Model,Score
5,Extra Trees,0.996944
0,Decision Tree,0.994024
6,Voting Classifier,0.851657
2,Gradient Boost,0.851606
3,SGB,0.851538
1,Random Forest,0.851486
4,Cat Boost,0.851486
