# Insurance Claim Modeling Techniques

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [4]:
columns = [
    "months_as_customer","age","policy_state","policy_csl","policy_deductable",
    "policy_annual_premium", "umbrella_limit","insured_zip", "insured_sex","insured_education_level","insured_occupation",
    "insured_relationship","capital-gains","capital-loss","incident_date","incident_type","collision_type",
    "incident_severity","authorities_contacted","incident_state","incident_city","incident_location",
    "incident_hour_of_the_day","number_of_vehicles_involved","property_damage","bodily_injuries","witnesses",
    "police_report_available","total_claim_amount","injury_claim","property_claim","vehicle_claim","auto_make","auto_model",
    "auto_year"
]

target = ["fraud_reported"]

In [5]:
# Load the data
file_path = Path('Resources/insurance_claims.csv')
df = pd.read_csv(file_path)
df = df.drop(["policy_number", "policy_bind_date","insured_hobbies"], axis=1)

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

df.head()

Unnamed: 0,months_as_customer,age,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,insured_education_level,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,328,48,OH,250/500,1000,1406.91,0,466132,MALE,MD,...,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y
1,228,42,IN,250/500,2000,1197.22,5000000,468176,MALE,MD,...,0,?,5070,780,780,3510,Mercedes,E400,2007,Y
2,134,29,OH,100/300,2000,1413.14,5000000,430632,FEMALE,PhD,...,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N
3,256,41,IL,250/500,2000,1415.74,6000000,608117,FEMALE,PhD,...,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y
4,228,44,IL,500/1000,1000,1583.91,6000000,610706,MALE,Associate,...,1,NO,6500,1300,650,4550,Accura,RSX,2009,N


# Split the Data into Training and Testing

In [6]:
# Create our features
X = df.drop("fraud_reported", axis=1)
X = pd.get_dummies(X)

# Create our target
y = df["fraud_reported"]
X.head()

Unnamed: 0,months_as_customer,age,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,...,auto_model_Pathfinder,auto_model_RAM,auto_model_RSX,auto_model_Silverado,auto_model_TL,auto_model_Tahoe,auto_model_Ultima,auto_model_Wrangler,auto_model_X5,auto_model_X6
0,328,48,1000,1406.91,0,466132,53300,0,5,1,...,0,0,0,0,0,0,0,0,0,0
1,228,42,2000,1197.22,5000000,468176,0,0,8,1,...,0,0,0,0,0,0,0,0,0,0
2,134,29,2000,1413.14,5000000,430632,35100,0,7,3,...,0,1,0,0,0,0,0,0,0,0
3,256,41,2000,1415.74,6000000,608117,48900,-62400,5,1,...,0,0,0,0,0,1,0,0,0,0
4,228,44,1000,1583.91,6000000,610706,66000,-46000,20,1,...,0,0,1,0,0,0,0,0,0,0


In [7]:
X.describe()

Unnamed: 0,months_as_customer,age,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,...,auto_model_Pathfinder,auto_model_RAM,auto_model_RSX,auto_model_Silverado,auto_model_TL,auto_model_Tahoe,auto_model_Ultima,auto_model_Wrangler,auto_model_X5,auto_model_X6
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,203.954,38.948,1136.0,1256.40615,1101000.0,501214.488,25126.1,-26793.7,11.644,1.839,...,0.031,0.043,0.012,0.022,0.02,0.024,0.023,0.042,0.023,0.016
std,115.113174,9.140287,611.864673,244.167395,2297407.0,71701.610941,27872.187708,28104.096686,6.951373,1.01888,...,0.173404,0.202959,0.10894,0.146757,0.14007,0.153126,0.149978,0.20069,0.149978,0.125538
min,0.0,19.0,500.0,433.33,-1000000.0,430104.0,0.0,-111100.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,115.75,32.0,500.0,1089.6075,0.0,448404.5,0.0,-51500.0,6.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,199.5,38.0,1000.0,1257.2,0.0,466445.5,0.0,-23250.0,12.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,276.25,44.0,2000.0,1415.695,0.0,603251.0,51025.0,0.0,17.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,479.0,64.0,2000.0,2047.59,10000000.0,620962.0,100500.0,0.0,23.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Check the balance of our target values
y.value_counts()

N    753
Y    247
Name: fraud_reported, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print ("Training:")
print(y_train.value_counts())
print ("Test:")
print(y_test.value_counts())
#print("Training:",Counter(y_train['fraud_reported']))
#print("Test:",Counter(y_test['fraud_reported']))


Training:
N    570
Y    180
Name: fraud_reported, dtype: int64
Test:
N    183
Y     67
Name: fraud_reported, dtype: int64


# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [10]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
random_forest_model = BalancedRandomForestClassifier(n_estimators = 100, random_state =0)
random_forest_model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=0)

In [11]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = random_forest_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7143381453388793

In [12]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[144,  39],
       [ 24,  43]], dtype=int64)

In [13]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          N       0.86      0.79      0.64      0.82      0.71      0.51       183
          Y       0.52      0.64      0.79      0.58      0.71      0.50        67

avg / total       0.77      0.75      0.68      0.76      0.71      0.51       250



In [14]:
# List the features sorted in descending order by feature importance
feature_names = X.columns
sorted(zip(random_forest_model.feature_importances_, feature_names), reverse=True)

[(0.0719628226977008, 'incident_severity_Major Damage'),
 (0.03368869568587793, 'vehicle_claim'),
 (0.03107541708130852, 'injury_claim'),
 (0.029026558408716294, 'total_claim_amount'),
 (0.028191784553727336, 'months_as_customer'),
 (0.025604135797680786, 'incident_severity_Minor Damage'),
 (0.025489589869237095, 'policy_annual_premium'),
 (0.02480897070319942, 'insured_zip'),
 (0.023904187734245158, 'property_claim'),
 (0.02310786249301669, 'incident_severity_Total Loss'),
 (0.02056643995802251, 'age'),
 (0.02006880216235862, 'incident_hour_of_the_day'),
 (0.018503652502035218, 'capital-loss'),
 (0.01846448220216106, 'auto_year'),
 (0.013264221991369623, 'capital-gains'),
 (0.012282680107519914, 'witnesses'),
 (0.011214425404470445, 'umbrella_limit'),
 (0.01070906758085463, 'bodily_injuries'),
 (0.00915602668326483, 'policy_csl_500/1000'),
 (0.009026099321385121, 'policy_deductable'),
 (0.008246904724174792, 'collision_type_?'),
 (0.006892881364535372, 'insured_sex_MALE'),
 (0.0067000

### Easy Ensemble AdaBoost Classifier

In [15]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
easyensemble_model = EasyEnsembleClassifier(n_estimators = 100,random_state=1)
easyensemble_model.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [16]:
# Calculated the balanced accuracy score
y_pred = easyensemble_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6642198841856293

In [17]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[112,  71],
       [ 19,  48]], dtype=int64)

In [18]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          N       0.85      0.61      0.72      0.71      0.66      0.43       183
          Y       0.40      0.72      0.61      0.52      0.66      0.44        67

avg / total       0.73      0.64      0.69      0.66      0.66      0.44       250

