# Detecting Auto insurance fraud claims

In [2]:
!pip install -U imbalanced-learn



In [3]:
!pip install mlxtend



In [4]:
!pip install streamlit



In [5]:
# Import all the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import classification_report 
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, accuracy_score, recall_score, roc_curve, precision_recall_curve, auc


In [6]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_Interactivity = 'all'

In [7]:
# Read dataset
data = pd.read_csv("insurance_claims.csv")

In [8]:
data.shape

(1000, 40)

In [9]:
# fraud_reported is our target column. We will convert it to 1 and 0 and build the target dataframe.
data['fraud_reported'] = data['fraud_reported'].str.replace('Y', '1')
data['fraud_reported'] = data['fraud_reported'].str.replace('N', '0')
data['fraud_reported'] = data['fraud_reported'].astype(int)
data_target = data['fraud_reported']
data_target.shape

(1000,)

In [10]:
data.shape

(1000, 40)

In [11]:
data.columns

Index(['months_as_customer', 'age', 'policy_number', 'policy_bind_date',
       'policy_state', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'capital-gains', 'capital-loss',
       'incident_date', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_location', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_model', 'auto_year', 'fraud_reported', '_c39'],
      dtype='object')

In [12]:
# In another notebook 'Auto Insurance Fraud claim detection-background analysis' where I used techniques like correlation matrix and Chi square 
# for feature selection, below are the important features I have selected to use for my app to predict the fraud claims.

# 'insured_hobbies', 'collision_type',  'months_as_customer_groups', 'policy_deductable', 'incident_severity',
# 'vehicle_claim_groups', 'umbrella_limit', 'number_of_vehicles_involved' ,'bodily_injuries','witnesses', 'incident_type', 
# 'authorities_contacted' ,'police_report_available'

# Reading the dataset with all the above columns:
data_app = pd.read_csv("data_req.csv")
data_app.head()   # It has only the input features.


Unnamed: 0,insured_hobbies_new,collision_type_new,months_as_customer_groups,policy_deductable,incident_severity,vehicle_claim_groups,umbrella_limit,number_of_vehicles_involved,bodily_injuries,witnesses,incident_type,authorities_contacted,police_report_available_new
0,other,Side Collision,301-350,1000,Major Damage,50001-60000,0,1,1,2,Single Vehicle Collision,Police,YES
1,other,other,201-250,2000,Minor Damage,0-10000,5000000,1,0,0,Vehicle Theft,Police,other
2,other,Rear Collision,101-150,2000,Minor Damage,20001-30000,5000000,3,2,3,Multi-vehicle Collision,Police,NO
3,other,Front Collision,251-300,2000,Major Damage,50001-60000,6000000,1,1,2,Single Vehicle Collision,Police,NO
4,other,other,201-250,1000,Minor Damage,0-10000,6000000,1,0,1,Vehicle Theft,,NO


In [13]:
data_app.shape

(1000, 13)

# label encoding

In [14]:
data_app.dtypes    

insured_hobbies_new            object
collision_type_new             object
months_as_customer_groups      object
policy_deductable               int64
incident_severity              object
vehicle_claim_groups           object
umbrella_limit                  int64
number_of_vehicles_involved     int64
bodily_injuries                 int64
witnesses                       int64
incident_type                  object
authorities_contacted          object
police_report_available_new    object
dtype: object

In [15]:
# label endcoding for the object datatypes.

for col in ['insured_hobbies_new', 'collision_type_new', 'months_as_customer_groups', 'incident_severity',
'vehicle_claim_groups', 'incident_type', 'authorities_contacted' ,'police_report_available_new']:
    if (data_app[col].dtype == 'object'):
        le = preprocessing.LabelEncoder()
        le = le.fit(data_app[col])
        data_app[col] = le.transform(data_app[col])
        print('Completed Label encoding on',col)


Completed Label encoding on insured_hobbies_new
Completed Label encoding on collision_type_new
Completed Label encoding on months_as_customer_groups
Completed Label encoding on incident_severity
Completed Label encoding on vehicle_claim_groups
Completed Label encoding on incident_type
Completed Label encoding on authorities_contacted
Completed Label encoding on police_report_available_new


In [16]:
data_app.head()

Unnamed: 0,insured_hobbies_new,collision_type_new,months_as_customer_groups,policy_deductable,incident_severity,vehicle_claim_groups,umbrella_limit,number_of_vehicles_involved,bodily_injuries,witnesses,incident_type,authorities_contacted,police_report_available_new
0,2,2,5,1000,0,5,0,1,1,2,2,4,1
1,2,3,3,2000,1,0,5000000,1,0,0,3,4,2
2,2,1,1,2000,1,2,5000000,3,2,3,0,4,0
3,2,0,4,2000,0,5,6000000,1,1,2,2,4,0
4,2,3,3,1000,1,0,6000000,1,0,1,3,2,0


In [17]:
# Feature Scaling.

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
data_scaled = sc.fit_transform(data_app)


In [18]:
# Splitting data into train and test

x_train, x_test, y_train, y_test = train_test_split(data_scaled, data_target, random_state = 1)
print('x_train:', x_train.shape, 'x_test:', x_test.shape, 'y_train:', y_train.shape, 'y_test:', y_test.shape )


x_train: (750, 13) x_test: (250, 13) y_train: (750,) y_test: (250,)


# Logistic regression

In [19]:
log = LogisticRegression()

log.fit(x_train, y_train)
prediction = log.predict(x_test)

score = log.score(x_test, y_test)
print(score*100)
print()
print(classification_report(y_test, prediction))


82.8

              precision    recall  f1-score   support

           0       0.83      0.96      0.89       180
           1       0.83      0.49      0.61        70

    accuracy                           0.83       250
   macro avg       0.83      0.72      0.75       250
weighted avg       0.83      0.83      0.81       250



# Decision Tree

In [20]:
dtc = DecisionTreeClassifier()

dtc.fit(x_train, y_train)
preds = dtc.predict(x_test)

score = dtc.score(x_test, y_test)
print(score*100)
print()
print(classification_report(y_test, preds))

78.8

              precision    recall  f1-score   support

           0       0.82      0.90      0.86       180
           1       0.66      0.50      0.57        70

    accuracy                           0.79       250
   macro avg       0.74      0.70      0.71       250
weighted avg       0.78      0.79      0.78       250



# Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state = 1)
rfc.fit(x_train, y_train)
preds = rfc.predict(x_test)

score = rfc.score(x_test, y_test)
print(score*100)
print()
print(classification_report(y_test, preds))

82.39999999999999

              precision    recall  f1-score   support

           0       0.85      0.92      0.88       180
           1       0.73      0.59      0.65        70

    accuracy                           0.82       250
   macro avg       0.79      0.75      0.77       250
weighted avg       0.82      0.82      0.82       250



# Random Forest with grid search CV

In [22]:
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
parameters = {'n_estimators':[100,300],'n_jobs':[-1], 'max_features': [0.5,0.7,0.9], 'min_samples_split': [2, 5, 10, 15],'max_depth': [3,5,7,15],'min_samples_leaf':[1,2,5,10],'random_state':[14]} 

clf1 = GridSearchCV(RandomForestClassifier(), parameters, cv=5, scoring='roc_auc')
clf1.fit(x_train, y_train)
pred_tuned = clf1.predict(x_test)

score = clf1.score(x_test, y_test)
print(score*100)
print()
print(classification_report(y_test, preds))


90.65873015873017

              precision    recall  f1-score   support

           0       0.85      0.92      0.88       180
           1       0.73      0.59      0.65        70

    accuracy                           0.82       250
   macro avg       0.79      0.75      0.77       250
weighted avg       0.82      0.82      0.82       250



In [23]:
# predictions on test using clf model.
pred_tuned = clf1.predict(x_test)
pred_tuned.shape

(250,)

In [24]:
# Creating dataframe with actual and predicted values to compare.

diff = pd.DataFrame({'Actual': y_test, 'Predicted': pred_tuned})
diff.head()


Unnamed: 0,Actual,Predicted
507,0,0
818,0,0
452,0,0
368,1,0
242,0,0


In [25]:
# Adding new columns 'actual' and 'predictions' to data_app in order to understand what type of records are predicted correctly.

data_app['actual'] = diff['Actual']
data_app['predictions'] = diff['Predicted']


In [26]:
# In below result, we will see many actual and predictions as NaN because data_app consists of 1000 rows and with my model, I
# predicted on only 250 rows.

data_app


Unnamed: 0,insured_hobbies_new,collision_type_new,months_as_customer_groups,policy_deductable,incident_severity,vehicle_claim_groups,umbrella_limit,number_of_vehicles_involved,bodily_injuries,witnesses,incident_type,authorities_contacted,police_report_available_new,actual,predictions
0,2,2,5,1000,0,5,0,1,1,2,2,4,1,,
1,2,3,3,2000,1,0,5000000,1,0,0,3,4,2,,
2,2,1,1,2000,1,2,5000000,3,2,3,0,4,0,0.0,0.0
3,2,0,4,2000,0,5,6000000,1,1,2,2,4,0,1.0,1.0
4,2,3,3,1000,1,0,6000000,1,0,1,3,2,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2,0,0,1000,1,6,0,1,0,1,2,1,2,0.0,0.0
996,2,1,4,1000,0,7,0,1,2,3,2,1,2,,
997,2,2,1,500,1,5,3000000,3,2,3,0,4,1,0.0,0.0
998,2,1,8,2000,0,3,5000000,1,0,1,2,3,1,,


In [27]:
# Dropping all the rows with NaN values.
# So that I can have a cleaned view of my predictions.

data_app_prediction = data_app.dropna()
print(data_app_prediction.shape)
data_app_prediction.head()



2021-08-03 10:49:39.802 INFO    numexpr.utils: Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2021-08-03 10:49:39.802 INFO    numexpr.utils: NumExpr defaulting to 8 threads.


(250, 15)


Unnamed: 0,insured_hobbies_new,collision_type_new,months_as_customer_groups,policy_deductable,incident_severity,vehicle_claim_groups,umbrella_limit,number_of_vehicles_involved,bodily_injuries,witnesses,incident_type,authorities_contacted,police_report_available_new,actual,predictions
2,2,1,1,2000,1,2,5000000,3,2,3,0,4,0,0.0,0.0
3,2,0,4,2000,0,5,6000000,1,1,2,2,4,0,1.0,1.0
6,2,0,1,1000,1,5,0,3,0,0,0,4,2,0.0,0.0
8,2,0,0,500,2,2,0,1,1,1,2,4,1,0.0,0.0
12,2,1,9,500,2,4,3000000,1,1,0,2,0,0,0.0,0.0


# With Grid search CV applied on random Forest model, accuracy increased to 90.6%, hence we will be using random forest clf1 as our final model and deploy it to production to predict Auto insurance fraud claim detection.

In [31]:
# Finally save model using pickle and build app uisng SteamLit

In [28]:
import streamlit as st
import pickle

# save the model
filename = 'Streamlit_Autoinsurancefraud.pkl'
pickle.dump(clf1, open(filename,'wb'))

# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(x_test, y_test)
print(result)


0.9065873015873016


In [29]:
# Now we will use another notebook named 'Predict Auto Insurance Fraud(streamlit)' 
# to build the app to predict the whether a reported claim is fraud or genuine using streamlit and again save the notebook in 
# C folder with name 'streamlitpredictautoinsurancefraud'