In [1]:
import numpy as np 
import pandas as pd


In [2]:
# Load the raw workbook into the working DataFrame `df`.
df = pd.read_excel(io='IFD.xlsx')

In [3]:
# incident_severity is treated as ordinal: replace each label with an integer
# code that grows with the severity of the damage.
severity_codes = {
    "Trivial Damage": 0,
    "Minor Damage": 1,
    "Major Damage": 2,
    "Total Loss": 3,
}
severity_encoded = df['incident_severity'].map(severity_codes)
df['incident_severity'] = severity_encoded.astype("int32")

# Number of rows per severity code.
df['incident_severity'].value_counts()

1    354
3    280
2    276
0     90
Name: incident_severity, dtype: int64

In [4]:
# Columns handled as continuous / numeric variables.
# NOTE(review): 'policy_bind_year' is only added to df in a later cell, and
# several names here also appear in ord_var — confirm the overlap is intended.
cont_var = [
    "age",
    "incident_hour_of_the_day",
    "number_of_vehicles_involved",
    "total_claim_amount",
    "injury_claim",
    "property_claim",
    "vehicle_claim",
    "months_as_customer",
    "policy_annual_premium",
    "policy_deductable",
    "umbrella_limit",
    "capital_gains",
    "capital_loss",
    "auto_year",
    "witnesses",
    "bodily_injuries",
    "policy_bind_year",
    "incident_severity",
]

In [5]:
# Columns handled as ordinal variables (ordered levels).
ord_var = [
    "policy_deductable",
    "witnesses",
    "bodily_injuries",
    "incident_severity",
]

In [6]:
# Columns handled as nominal (unordered categorical) variables.
nom_var = [
    "incident_date", "police_report_available", "incident_city",
    "property_damage", "policy_csl", "insured_occupation",
    "collision_type", "incident_type", "incident_location",
    "fraud_reported", "incident_month", "insured_education_level",
    "auto_model", "incident_state", "policy_bind_date",
    "policy_number", "insured_zip", "insured_hobbies",
    "insured_sex", "insured_relationship", "auto_make",
    "policy_state", "authorities_contacted",
]

## Exploratory Data Analysis:

### Dependent variable:

### Losses by claims

Here, I define loss simply as money going out of the insurance company, i.e. claims paid. The source of money coming in, on the other hand, is premiums. Although premiums and claims are not the only sources of money flowing into or out of an insurance company, these two variables are used because they are the only such information available in this data set. Other sources of money movement may include, for instance, investments made by the insurance company.

In [8]:
# Extract the calendar year of each policy's bind date into its own column.
bind_dates = pd.DatetimeIndex(df['policy_bind_date'])
df['policy_bind_year'] = bind_dates.year

In [9]:
# Net result per client: total claim amount minus the premiums paid since the
# policy was bound. Positive values indicate a loss, negative values a profit.
# NOTE(review): 2015 is presumably the reference year of the data — confirm.
years_of_premiums = 2015 - df['policy_bind_year']
premiums_paid = df['policy_annual_premium'] * years_of_premiums
df['loss_by_claims'] = df['total_claim_amount'] - premiums_paid

## Pre-processing:

In [10]:
# Encode the dependent variable numerically: "Y" -> 1, "N" -> 0.
fraud_codes = {"Y": 1, "N": 0}
df['fraud_reported'] = df['fraud_reported'].map(fraud_codes)

# Show the encoded target column.
df['fraud_reported']

0      1
1      1
2      0
3      1
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Name: fraud_reported, Length: 1000, dtype: int64

In [11]:
# Encode insured_sex numerically: "FEMALE" -> 0, "MALE" -> 1.
sex_codes = {"FEMALE": 0, "MALE": 1}
df['insured_sex'] = df['insured_sex'].map(sex_codes)

In [12]:
# Flip the sign of capital_loss (multiply every value by -1).
df['capital_loss'] = df['capital_loss'].mul(-1)

## PyCaret section

### Reduced dataset

In [14]:
# Reduced set of columns used for modeling; 'fraud_reported' is the target.
colo = ['incident_severity', 'insured_hobbies', 'capital_loss', 'collision_type',
        'incident_state', 'policy_annual_premium', 'loss_by_claims', 'property_claim',
        'fraud_reported']

# Select the columns directly rather than stacking the Series as rows and
# transposing: that round-trip mixes strings and numbers inside each
# intermediate column, so every resulting column ends up with dtype=object.
# Plain column selection keeps the same rows, column order and index while
# preserving each column's dtype. .copy() keeps df1 independent of df.
df1 = df[colo].copy()

In [15]:
# `dataset` is a second name for the reduced frame df1 (no copy is made).
dataset=df1

In [16]:
# Draw a reproducible 95% sample for modeling; the rows that were not sampled
# are held back as unseen data for the final prediction check.
modeling_rows = dataset.sample(frac=0.95, random_state=21)
data_unseen = dataset.drop(modeling_rows.index).reset_index(drop=True)
data = modeling_rows.reset_index(drop=True)

print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions: {data_unseen.shape}')

Data for Modeling: (950, 9)
Unseen Data For Predictions: (50, 9)


In [17]:
from pycaret.classification import *

In [18]:
# Preview the first rows of the reduced dataset.
dataset.head()

Unnamed: 0,incident_severity,insured_hobbies,capital_loss,collision_type,incident_state,policy_annual_premium,loss_by_claims,property_claim,fraud_reported
0,2,sleeping,0,Side Collision,SC,1406.91,70203.1,13020,1
1,1,reading,0,?,VA,1197.22,-5704.98,780,1
2,1,board-games,0,Rear Collision,NY,1413.14,13452.9,3850,0
3,2,board-games,62400,Front Collision,OH,1415.74,28006.5,6340,1
4,1,board-games,46000,?,NY,1583.91,4916.09,650,0


In [19]:
# Class balance of the target within the modeling sample.
data['fraud_reported'].value_counts()

0    711
1    239
Name: fraud_reported, dtype: int64

In [20]:
#from imblearn.under_sampling import RandomUnderSampler
#from imblearn.under_sampling import TomekLinks


In [21]:
#RUS=RandomUnderSampler()
#TL=TomekLinks()

In [22]:
# Initialise the PyCaret classification experiment on the modeling sample.
# setup() shows the inferred column types and waits for confirmation before
# continuing, so this cell needs user interaction when run.
classif1 = setup(
    data=data,
    target='fraud_reported',
    session_id=128,
    fold=10,
    normalize=True,
    fix_imbalance=True,
)

IntProgress(value=0, description='Processing: ', max=3)

Text(value="Following data types have been inferred automatically, if they are correct press enter to continue…

Unnamed: 0,Data Type
incident_severity,Categorical
insured_hobbies,Categorical
capital_loss,Numeric
collision_type,Categorical
incident_state,Categorical
policy_annual_premium,Numeric
loss_by_claims,Numeric
property_claim,Numeric
fraud_reported,Label


KeyboardInterrupt: Interrupted by user

In [None]:
#classif1 = setup(data = data, target = 'fraud_reported', normalize=True,session_id=128, fold=10, fix_imbalance=True) 

In [None]:
# Display the modeling DataFrame.
data

In [None]:
# Run PyCaret's model comparison and keep its result as best_model.
best_model = compare_models()

In [None]:
# Show the table of estimators available in PyCaret's model library.
models()


In [None]:
# Fit a CatBoost classifier, then tune its hyperparameters for F1
# using 10 search iterations.
catboost_base = create_model('catboost')
tuned_catboost = tune_model(catboost_base, optimize='F1', n_iter=10)

In [None]:
# Fit a logistic regression classifier, then tune its hyperparameters for F1
# using 10 search iterations.
lr_base = create_model('lr')
tuned_lr = tune_model(lr_base, optimize='F1', n_iter=10)

In [None]:
# The tuned model object is stored in the variable 'tuned_catboost'.
# Plot its confusion matrix, then print the estimator itself.
plot_model(tuned_catboost, plot = 'confusion_matrix')
print(tuned_catboost)

In [None]:
# ROC / AUC plot for the tuned CatBoost model.
plot_model(tuned_catboost, plot = 'auc')

In [None]:
# Precision-recall plot for the tuned CatBoost model.
plot_model(tuned_catboost, plot = 'pr')

In [None]:
# Feature-importance plot ('feature_all' variant) for the tuned CatBoost model.
plot_model(tuned_catboost, plot = 'feature_all')

In [None]:
# Open PyCaret's evaluation view for the tuned CatBoost model.
evaluate_model(tuned_catboost)

In [None]:
# Score the tuned CatBoost model; no data is passed, so PyCaret uses the
# hold-out split it kept at setup(). The trailing ';' hides the returned frame.
predict_model(tuned_catboost);

In [None]:
# Score the tuned logistic regression model; no data is passed, so PyCaret uses
# the hold-out split it kept at setup(). The trailing ';' hides the returned frame.
predict_model(tuned_lr);

In [None]:
# Finalize the tuned CatBoost model, then predict on the 5% of rows that were
# held back from modeling (data_unseen).
final_catboost = finalize_model(tuned_catboost)
unseen_predictions_catboost = predict_model(final_catboost, data=data_unseen)
# Preview the first 10 predictions.
unseen_predictions_catboost.head(10)

In [None]:
from pycaret.utils import check_metric

# F1 score of the finalized CatBoost model on the unseen rows:
# true target values versus the predicted 'Label' column.
actual_fraud = unseen_predictions_catboost['fraud_reported']
predicted_fraud = unseen_predictions_catboost['Label']
check_metric(actual_fraud, predicted_fraud, metric='F1')

In [None]:
#import pickle

#pickle.dump(tuned_catboost, open('modelCB.pkl','wb'))

#model = pickle.load(open('modelCB.pkl','rb'))
#model

In [None]:
#save_model(tuned_catboost, 'modelCB')

In [None]:
# Display the modeling DataFrame.
data

In [None]:
# Save the transformation pipeline and the finalized CatBoost model
# under the name 'modelCB'.
save_model(final_catboost, model_name ='modelCB')
#save_model(lr, model_name = 'deployment_28042020')

In [None]:
# Display the modeling DataFrame.
data