# Importing Required Libraries

In [1]:
import pandas as pd
import pandas_profiling as pp
import plotly.express as px
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Load and view dataset

In [2]:
def data_extraction(file_name):
    if os.path.isfile(file_name):
        df = pd.read_csv(file_name)
        return df
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_name)

In [4]:
df = data_extraction('./../data/insurance_claims.csv')
df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,...,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,...,0,?,5070,780,780,3510,Mercedes,E400,2007,Y
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,...,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,...,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,...,1,NO,6500,1300,650,4550,Accura,RSX,2009,N


In [5]:
df.shape

(1000, 39)

In [6]:
df.dtypes

months_as_customer               int64
age                              int64
policy_number                    int64
policy_bind_date                object
policy_state                    object
policy_csl                      object
policy_deductable                int64
policy_annual_premium          float64
umbrella_limit                   int64
insured_zip                      int64
insured_sex                     object
insured_education_level         object
insured_occupation              object
insured_hobbies                 object
insured_relationship            object
capital-gains                    int64
capital-loss                     int64
incident_date                   object
incident_type                   object
collision_type                  object
incident_severity               object
authorities_contacted           object
incident_state                  object
incident_city                   object
incident_location               object
incident_hour_of_the_day 

In [7]:
df.columns

Index(['months_as_customer', 'age', 'policy_number', 'policy_bind_date',
       'policy_state', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'capital-gains', 'capital-loss',
       'incident_date', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_location', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_model', 'auto_year', 'fraud_reported'],
      dtype='object')

In [8]:
# Viewing categorical columns

cat_cols = [col for col in df.columns if col not in df.describe().columns]

df[cat_cols].nunique().sort_values(ascending=False)


incident_location          1000
policy_bind_date            951
incident_date                60
auto_model                   39
insured_hobbies              20
auto_make                    14
insured_occupation           14
insured_education_level       7
incident_city                 7
incident_state                7
insured_relationship          6
authorities_contacted         5
collision_type                4
incident_severity             4
incident_type                 4
policy_state                  3
property_damage               3
police_report_available       3
policy_csl                    3
insured_sex                   2
fraud_reported                2
dtype: int64

# Data Visualization

### Pandas profiling to analyze and visualize the dataset

In [195]:
prof = pp.ProfileReport(df, title="Pandas Profiling Report")
prof.to_file('profile_report.html')


Summarize dataset: 100%|██████████| 248/248 [00:41<00:00,  6.01it/s, Completed]                                                 
Generate report structure: 100%|██████████| 1/1 [00:17<00:00, 17.79s/it]
Render HTML: 100%|██████████| 1/1 [00:19<00:00, 19.47s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00,  7.48it/s]


In [196]:
df['fraud_reported'].value_counts()

N    753
Y    247
Name: fraud_reported, dtype: int64

### Visualizing the corelation between different numeric variables

In [197]:
px.imshow(df.corr(),text_auto=True, aspect="auto")

### Description of the numeric variables

In [198]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
months_as_customer,1000.0,203.954,115.1132,0.0,115.75,199.5,276.25,479.0
age,1000.0,38.948,9.140287,19.0,32.0,38.0,44.0,64.0
policy_number,1000.0,546238.6,257063.0,100804.0,335980.25,533135.0,759099.75,999435.0
policy_deductable,1000.0,1136.0,611.8647,500.0,500.0,1000.0,2000.0,2000.0
policy_annual_premium,1000.0,1256.406,244.1674,433.33,1089.6075,1257.2,1415.695,2047.59
umbrella_limit,1000.0,1101000.0,2297407.0,-1000000.0,0.0,0.0,0.0,10000000.0
insured_zip,1000.0,501214.5,71701.61,430104.0,448404.5,466445.5,603251.0,620962.0
capital-gains,1000.0,25126.1,27872.19,0.0,0.0,0.0,51025.0,100500.0
capital-loss,1000.0,-26793.7,28104.1,-111100.0,-51500.0,-23250.0,0.0,0.0
incident_hour_of_the_day,1000.0,11.644,6.951373,0.0,6.0,12.0,17.0,23.0


### Visualization of different categorical variables

##### Histograms of each variable v/s their count w.r.t the ouput are visualized 

In [199]:
fig = px.histogram(df, x="policy_state", color="fraud_reported")
fig.show()

In [200]:
fig = px.histogram(df, x="policy_csl", color="fraud_reported")
fig.show()

In [201]:
fig = px.histogram(df, x="insured_sex", color="fraud_reported")
fig.show()

In [202]:
fig = px.histogram(df, x="insured_education_level", color="fraud_reported")
fig.show()

In [203]:
fig = px.histogram(df, x="insured_occupation", color="fraud_reported")
fig.show()

In [204]:
fig = px.histogram(df, x="insured_hobbies", color="fraud_reported")
fig.show()

In [205]:
fig = px.histogram(df, x="insured_relationship", color="fraud_reported")
fig.show()

In [206]:
fig = px.histogram(df, x="incident_type", color="fraud_reported")
fig.show()

In [207]:
fig = px.histogram(df, x="collision_type", color="fraud_reported")
fig.show()

In [208]:
fig = px.histogram(df, x="incident_severity", color="fraud_reported")
fig.show()

In [209]:
fig = px.histogram(df, x="authorities_contacted", color="fraud_reported")
fig.show()

In [210]:
fig = px.histogram(df, x="incident_state", color="fraud_reported")
fig.show()

In [211]:
fig = px.histogram(df, x="incident_city", color="fraud_reported")
fig.show()

In [212]:
fig = px.histogram(df, x="property_damage", color="fraud_reported")
fig.show()

In [213]:
fig = px.histogram(df, x="number_of_vehicles_involved", color="fraud_reported")
fig.show()

In [214]:
fig = px.histogram(df, x="bodily_injuries", color="fraud_reported")
fig.show()

In [215]:
fig = px.histogram(df, x="witnesses", color="fraud_reported")
fig.show()

In [216]:
fig = px.histogram(df, x="auto_year", color="fraud_reported")
fig.show()

# Data preperation

### Dropping categorical variables which have too many unique values (>=14)

In [9]:
to_drop = ['policy_number','policy_bind_date','policy_state','insured_zip','incident_location','incident_date','auto_make','auto_model','insured_occupation']
df.drop(to_drop, inplace = True, axis = 1)
df.head()

Unnamed: 0,months_as_customer,age,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_sex,insured_education_level,insured_hobbies,insured_relationship,...,property_damage,bodily_injuries,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_year,fraud_reported
0,328,48,250/500,1000,1406.91,0,MALE,MD,sleeping,husband,...,YES,1,2,YES,71610,6510,13020,52080,2004,Y
1,228,42,250/500,2000,1197.22,5000000,MALE,MD,reading,other-relative,...,?,0,0,?,5070,780,780,3510,2007,Y
2,134,29,100/300,2000,1413.14,5000000,FEMALE,PhD,board-games,own-child,...,NO,2,3,NO,34650,7700,3850,23100,2007,N
3,256,41,250/500,2000,1415.74,6000000,FEMALE,PhD,board-games,unmarried,...,?,1,2,NO,63400,6340,6340,50720,2014,Y
4,228,44,500/1000,1000,1583.91,6000000,MALE,Associate,board-games,unmarried,...,NO,0,1,NO,6500,1300,650,4550,2009,N


### Dropping numeric variables having high correlation

In [10]:
df.drop(columns = ['age', 'total_claim_amount'], inplace = True, axis = 1)
df.head()

Unnamed: 0,months_as_customer,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_sex,insured_education_level,insured_hobbies,insured_relationship,capital-gains,...,number_of_vehicles_involved,property_damage,bodily_injuries,witnesses,police_report_available,injury_claim,property_claim,vehicle_claim,auto_year,fraud_reported
0,328,250/500,1000,1406.91,0,MALE,MD,sleeping,husband,53300,...,1,YES,1,2,YES,6510,13020,52080,2004,Y
1,228,250/500,2000,1197.22,5000000,MALE,MD,reading,other-relative,0,...,1,?,0,0,?,780,780,3510,2007,Y
2,134,100/300,2000,1413.14,5000000,FEMALE,PhD,board-games,own-child,35100,...,3,NO,2,3,NO,7700,3850,23100,2007,N
3,256,250/500,2000,1415.74,6000000,FEMALE,PhD,board-games,unmarried,48900,...,1,?,1,2,NO,6340,6340,50720,2014,Y
4,228,500/1000,1000,1583.91,6000000,MALE,Associate,board-games,unmarried,66000,...,1,NO,0,1,NO,1300,650,4550,2009,N


### 'Insured Hobbies' column has high positives for chess and crossfit. Hence these two values are kept and the other values are changed as 'others'

In [11]:
df['insured_hobbies']=df['insured_hobbies'].apply(lambda x :'Other' if x!='chess' and x!='cross-fit' else x)

In [12]:
df['fraud_reported'].replace(to_replace='Y', value=1, inplace=True)
df['fraud_reported'].replace(to_replace='N',  value=0, inplace=True)

df.head()

Unnamed: 0,months_as_customer,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_sex,insured_education_level,insured_hobbies,insured_relationship,capital-gains,...,number_of_vehicles_involved,property_damage,bodily_injuries,witnesses,police_report_available,injury_claim,property_claim,vehicle_claim,auto_year,fraud_reported
0,328,250/500,1000,1406.91,0,MALE,MD,Other,husband,53300,...,1,YES,1,2,YES,6510,13020,52080,2004,1
1,228,250/500,2000,1197.22,5000000,MALE,MD,Other,other-relative,0,...,1,?,0,0,?,780,780,3510,2007,1
2,134,100/300,2000,1413.14,5000000,FEMALE,PhD,Other,own-child,35100,...,3,NO,2,3,NO,7700,3850,23100,2007,0
3,256,250/500,2000,1415.74,6000000,FEMALE,PhD,Other,unmarried,48900,...,1,?,1,2,NO,6340,6340,50720,2014,1
4,228,500/1000,1000,1583.91,6000000,MALE,Associate,Other,unmarried,66000,...,1,NO,0,1,NO,1300,650,4550,2009,0


In [13]:
df.columns

Index(['months_as_customer', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_sex',
       'insured_education_level', 'insured_hobbies', 'insured_relationship',
       'capital-gains', 'capital-loss', 'incident_type', 'collision_type',
       'incident_severity', 'authorities_contacted', 'incident_state',
       'incident_city', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'injury_claim',
       'property_claim', 'vehicle_claim', 'auto_year', 'fraud_reported'],
      dtype='object')

### Removing rows with peculiar values

In [14]:
df.drop(df[df['umbrella_limit'] < 0].index, inplace = True)

### Replacing values with '?'

In [15]:
df = df.replace('?',np.NaN)

In [16]:
df.isna().sum()

months_as_customer               0
policy_csl                       0
policy_deductable                0
policy_annual_premium            0
umbrella_limit                   0
insured_sex                      0
insured_education_level          0
insured_hobbies                  0
insured_relationship             0
capital-gains                    0
capital-loss                     0
incident_type                    0
collision_type                 178
incident_severity                0
authorities_contacted            0
incident_state                   0
incident_city                    0
incident_hour_of_the_day         0
number_of_vehicles_involved      0
property_damage                360
bodily_injuries                  0
witnesses                        0
police_report_available        342
injury_claim                     0
property_claim                   0
vehicle_claim                    0
auto_year                        0
fraud_reported                   0
dtype: int64

In [17]:
df['collision_type'].fillna(df['collision_type'].mode()[0], inplace = True)

df['property_damage'].fillna('NO', inplace = True)

df['police_report_available'].fillna('NO', inplace = True)

df.isnull().any().any()

False

### One hot encoding categorical variables

In [18]:
cat_df = pd.get_dummies(df[[
    'policy_csl', 
    'insured_sex', 
    'insured_education_level',
    'insured_hobbies', 
    'insured_relationship',
    'incident_type', 
    'incident_severity',
    'authorities_contacted', 
    'incident_state', 
    'incident_city',
    'collision_type']])

cat_df = cat_df.join(df[[
    'property_damage', 
    'police_report_available']])

cat_df.head()

Unnamed: 0,policy_csl_100/300,policy_csl_250/500,policy_csl_500/1000,insured_sex_FEMALE,insured_sex_MALE,insured_education_level_Associate,insured_education_level_College,insured_education_level_High School,insured_education_level_JD,insured_education_level_MD,...,incident_city_Hillsdale,incident_city_Northbend,incident_city_Northbrook,incident_city_Riverwood,incident_city_Springfield,collision_type_Front Collision,collision_type_Rear Collision,collision_type_Side Collision,property_damage,police_report_available
0,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,YES,YES
1,0,1,0,0,1,0,0,0,0,1,...,0,0,0,1,0,0,1,0,NO,NO
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,NO,NO
3,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,NO,NO
4,0,0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,NO,NO


In [19]:
# Converting to binary values

cat_df['property_damage'].replace(to_replace='YES', value=1, inplace=True)
cat_df['property_damage'].replace(to_replace='NO', value=0, inplace=True)
cat_df['police_report_available'].replace(to_replace='YES', value=1, inplace=True)
cat_df['police_report_available'].replace(to_replace='NO', value=0, inplace=True)

cat_df.head(10) 


Unnamed: 0,policy_csl_100/300,policy_csl_250/500,policy_csl_500/1000,insured_sex_FEMALE,insured_sex_MALE,insured_education_level_Associate,insured_education_level_College,insured_education_level_High School,insured_education_level_JD,insured_education_level_MD,...,incident_city_Hillsdale,incident_city_Northbend,incident_city_Northbrook,incident_city_Riverwood,incident_city_Springfield,collision_type_Front Collision,collision_type_Rear Collision,collision_type_Side Collision,property_damage,police_report_available
0,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,1,1
1,0,1,0,0,1,0,0,0,0,1,...,0,0,0,1,0,0,1,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
7,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
8,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
9,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0


In [20]:
df_clean = pd.concat([cat_df, df._get_numeric_data()], axis=1)  # joining numeric columns
df_clean.head()

Unnamed: 0,policy_csl_100/300,policy_csl_250/500,policy_csl_500/1000,insured_sex_FEMALE,insured_sex_MALE,insured_education_level_Associate,insured_education_level_College,insured_education_level_High School,insured_education_level_JD,insured_education_level_MD,...,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,injury_claim,property_claim,vehicle_claim,auto_year,fraud_reported
0,0,1,0,0,1,0,0,0,0,1,...,0,5,1,1,2,6510,13020,52080,2004,1
1,0,1,0,0,1,0,0,0,0,1,...,0,8,1,0,0,780,780,3510,2007,1
2,1,0,0,1,0,0,0,0,0,0,...,0,7,3,2,3,7700,3850,23100,2007,0
3,0,1,0,1,0,0,0,0,0,0,...,-62400,5,1,1,2,6340,6340,50720,2014,1
4,0,0,1,0,1,1,0,0,0,0,...,-46000,20,1,0,1,1300,650,4550,2009,0


In [21]:
X = df_clean[df_clean.columns.drop('fraud_reported')]
Y = df_clean['fraud_reported']

In [22]:
X_train_df, X_test_df, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=42)

In [23]:
# standardization

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train_df)
X_test = sc.transform(X_test_df)

# Modelling

### Random Forest

In [232]:
model_rfc = BalancedRandomForestClassifier(n_estimators = 100, random_state = 0)

model_rfc.fit(X_train, Y_train)
Y_pred_rf = model_rfc.predict(X_test)

print("Training Accuracy: ", model_rfc.score(X_train, Y_train))
print('Testing Accuarcy: ', model_rfc.score(X_test, Y_test))

print(confusion_matrix(Y_test, Y_pred_rf))
print(classification_report(Y_test, Y_pred_rf))

Training Accuracy:  0.8948685857321652
Testing Accuarcy:  0.84
[[126  26]
 [  6  42]]
              precision    recall  f1-score   support

           0       0.95      0.83      0.89       152
           1       0.62      0.88      0.72        48

    accuracy                           0.84       200
   macro avg       0.79      0.85      0.81       200
weighted avg       0.87      0.84      0.85       200



### XGB Classifier

In [233]:
xgb = XGBClassifier(use_label_encoder=False)
xgb.fit(X_train, Y_train)

Y_pred_xgb = xgb.predict(X_test)

xgb_train_acc = accuracy_score(Y_train, xgb.predict(X_train))
xgb_test_acc = accuracy_score(Y_test, Y_pred_xgb)

print(f"Training accuracy of XgBoost is : {xgb_train_acc}")
print(f"Test accuracy of XgBoost is : {xgb_test_acc}")

print(confusion_matrix(Y_test, Y_pred_xgb))
print(classification_report(Y_test, Y_pred_xgb))

Training accuracy of XgBoost is : 1.0
Test accuracy of XgBoost is : 0.805
[[131  21]
 [ 18  30]]
              precision    recall  f1-score   support

           0       0.88      0.86      0.87       152
           1       0.59      0.62      0.61        48

    accuracy                           0.81       200
   macro avg       0.73      0.74      0.74       200
weighted avg       0.81      0.81      0.81       200



### KNN

In [234]:
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)

y_pred_knn = knn.predict(X_test)

knn_train_acc = accuracy_score(Y_train, knn.predict(X_train))
knn_test_acc = accuracy_score(Y_test, y_pred_knn)

print(f"Training accuracy of KNN is : {knn_train_acc}")
print(f"Test accuracy of KNN is : {knn_test_acc}")

print(confusion_matrix(Y_test, y_pred_knn))
print(classification_report(Y_test, y_pred_knn))

Training accuracy of KNN is : 0.8760951188986232
Test accuracy of KNN is : 0.775
[[140  12]
 [ 33  15]]
              precision    recall  f1-score   support

           0       0.81      0.92      0.86       152
           1       0.56      0.31      0.40        48

    accuracy                           0.78       200
   macro avg       0.68      0.62      0.63       200
weighted avg       0.75      0.78      0.75       200



### Cat Boost

In [235]:
cat = CatBoostClassifier()
cat.fit(X_train, Y_train)

cat_acc = accuracy_score(Y_test, cat.predict(X_test))

Learning rate set to 0.009361
0:	learn: 0.6835765	total: 43ms	remaining: 43s
1:	learn: 0.6736842	total: 46ms	remaining: 22.9s
2:	learn: 0.6643364	total: 67.7ms	remaining: 22.5s
3:	learn: 0.6554416	total: 75.9ms	remaining: 18.9s
4:	learn: 0.6465109	total: 83.1ms	remaining: 16.5s
5:	learn: 0.6380511	total: 90.7ms	remaining: 15s
6:	learn: 0.6293070	total: 97.9ms	remaining: 13.9s
7:	learn: 0.6224801	total: 105ms	remaining: 13s
8:	learn: 0.6139039	total: 111ms	remaining: 12.2s
9:	learn: 0.6073038	total: 116ms	remaining: 11.5s
10:	learn: 0.5997593	total: 120ms	remaining: 10.8s
11:	learn: 0.5923461	total: 124ms	remaining: 10.2s
12:	learn: 0.5853702	total: 128ms	remaining: 9.68s
13:	learn: 0.5781245	total: 131ms	remaining: 9.25s
14:	learn: 0.5713238	total: 135ms	remaining: 8.86s
15:	learn: 0.5646392	total: 138ms	remaining: 8.47s
16:	learn: 0.5582487	total: 141ms	remaining: 8.12s
17:	learn: 0.5516651	total: 143ms	remaining: 7.8s
18:	learn: 0.5453191	total: 146ms	remaining: 7.53s
19:	learn: 0.54

In [236]:
print(f"Training Accuracy of Cat Boost Classifier is {accuracy_score(Y_train, cat.predict(X_train))}")
print(f"Test Accuracy of Cat Boost Classifier is {cat_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(Y_test, cat.predict(X_test))}\n")
print(f"Classification Report :- \n {classification_report(Y_test, cat.predict(X_test))}")

Training Accuracy of Cat Boost Classifier is 0.9849812265331664
Test Accuracy of Cat Boost Classifier is 0.81 

Confusion Matrix :- 
[[130  22]
 [ 16  32]]

Classification Report :- 
               precision    recall  f1-score   support

           0       0.89      0.86      0.87       152
           1       0.59      0.67      0.63        48

    accuracy                           0.81       200
   macro avg       0.74      0.76      0.75       200
weighted avg       0.82      0.81      0.81       200



### Feature Importance

In [237]:
num = X.select_dtypes(include = np.number)
cat = X.select_dtypes(exclude = np.number)

feaure_list = num.join(cat)

In [238]:
# rfc = RandomForestClassifier(random_state = 42, oob_score = True)

#Define function to view important features
def feature_importance(estimator, features):
    important_features = pd.DataFrame(list(estimator.feature_importances_), columns = ['feature importance'])
    features = pd.DataFrame(list(features), columns = ['Features'])
    global best_features
    best_features = features.join(important_features)
    best_features = best_features.sort_values(by = 'feature importance', ascending = False)
    print(best_features)
    
#Apply function on rfc
feature_importance(model_rfc, feaure_list)

                          Features  feature importance
25  incident_severity_Major Damage            0.157438
12           insured_hobbies_Other            0.061389
65                   vehicle_claim            0.050360
64                  property_claim            0.050088
26  incident_severity_Minor Damage            0.043365
..                             ...                 ...
19  insured_relationship_unmarried            0.003555
11     insured_education_level_PhD            0.003537
37               incident_state_PA            0.003010
22        incident_type_Parked Car            0.002363
36               incident_state_OH            0.001319

[67 rows x 2 columns]


In [239]:
#Subset best_features on unimportant_features

unimportant_features = best_features[best_features['feature importance'] < 0.01]
list_unimportant_features = unimportant_features['Features'].tolist()
X_train_rfc = X_train_df.drop(list_unimportant_features, 1)
X_test_rfc = X_test_df.drop(list_unimportant_features, 1)
feaure_list_rfc = feaure_list.drop(list_unimportant_features, 1)


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only



In [240]:
sc = StandardScaler()
X_train_rfc = sc.fit_transform(X_train_rfc)
X_train_rfc = sc.transform(X_train_rfc)


model_rfc = BalancedRandomForestClassifier(n_estimators = 100, random_state = 0)

model_rfc.fit(X_train_rfc, Y_train)
Y_pred_rf = model_rfc.predict(X_test_rfc)

print("Training Accuracy: ", model_rfc.score(X_train_rfc, Y_train))
print('Testing Accuarcy: ', model_rfc.score(X_test_rfc, Y_test))

print(confusion_matrix(Y_test, Y_pred_rf))
print(classification_report(Y_test, Y_pred_rf))


X does not have valid feature names, but StandardScaler was fitted with feature names



Training Accuracy:  0.8923654568210263
Testing Accuarcy:  0.8
[[128  24]
 [ 16  32]]
              precision    recall  f1-score   support

           0       0.89      0.84      0.86       152
           1       0.57      0.67      0.62        48

    accuracy                           0.80       200
   macro avg       0.73      0.75      0.74       200
weighted avg       0.81      0.80      0.80       200




X has feature names, but BalancedRandomForestClassifier was fitted without feature names


X has feature names, but BalancedRandomForestClassifier was fitted without feature names



### Hyper parameter tuning

In [241]:
grid_params = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [3, 5, 7, 10],
    'min_samples_split' : range(2, 10, 1),
    'min_samples_leaf' : range(2, 10, 1)
}

grid_search = GridSearchCV(model_rfc, grid_params, cv = 5, n_jobs = -1, verbose = 1)
grid_search.fit(X_train, Y_train)

Fitting 5 folds for each of 512 candidates, totalling 2560 fits


GridSearchCV(cv=5, estimator=BalancedRandomForestClassifier(random_state=0),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [3, 5, 7, 10],
                         'min_samples_leaf': range(2, 10),
                         'min_samples_split': range(2, 10)},
             verbose=1)

In [242]:
# best parameters and best score

print(grid_search.best_params_)
print(grid_search.best_score_)

{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}
0.8598584905660378


In [243]:
# best estimator 

rf_tuned = grid_search.best_estimator_
Y_pred = rf_tuned.predict(X_test)

In [244]:
rf_tuned_train_acc = accuracy_score(Y_train, rf_tuned.predict(X_train))
rf_tuned_test_acc = accuracy_score(Y_test, Y_pred)

print(f"Training accuracy of Decision Tree is : {rf_tuned_train_acc}")
print(f"Test accuracy of Decision Tree is : {rf_tuned_test_acc}")

print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))

Training accuracy of Decision Tree is : 1.0
Test accuracy of Decision Tree is : 0.84
[[126  26]
 [  6  42]]
              precision    recall  f1-score   support

           0       0.95      0.83      0.89       152
           1       0.62      0.88      0.72        48

    accuracy                           0.84       200
   macro avg       0.79      0.85      0.81       200
weighted avg       0.87      0.84      0.85       200



In [26]:
from sklearn.ensemble import RandomForestClassifier

rand_clf = RandomForestClassifier()
rand_clf.fit(X_train, Y_train)

Y_pred = rand_clf.predict(X_test)

rf_tuned_train_acc = accuracy_score(Y_train, rand_clf.predict(X_train))
rf_tuned_test_acc = accuracy_score(Y_test, Y_pred)

print(f"Training accuracy of Decision Tree is : {rf_tuned_train_acc}")
print(f"Test accuracy of Decision Tree is : {rf_tuned_test_acc}")

print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))

Training accuracy of Decision Tree is : 1.0
Test accuracy of Decision Tree is : 0.825
[[134  18]
 [ 17  31]]
              precision    recall  f1-score   support

           0       0.89      0.88      0.88       152
           1       0.63      0.65      0.64        48

    accuracy                           0.82       200
   macro avg       0.76      0.76      0.76       200
weighted avg       0.83      0.82      0.83       200



In [27]:
grid_params = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [3, 5, 7, 10],
    'min_samples_split' : range(2, 10, 1),
    'min_samples_leaf' : range(2, 10, 1)
}

grid_search = GridSearchCV(rand_clf, grid_params, cv = 5, n_jobs = -1, verbose = 1)
grid_search.fit(X_train, Y_train)

Fitting 5 folds for each of 512 candidates, totalling 2560 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [3, 5, 7, 10],
                         'min_samples_leaf': range(2, 10),
                         'min_samples_split': range(2, 10)},
             verbose=1)

In [38]:
# best parameters and best score

print(grid_search.best_params_)
print(grid_search.best_score_)

{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 8}
0.8410613207547171


In [39]:
# best estimator 

rf_tuned = grid_search.best_estimator_
Y_pred = rf_tuned.predict(X_test)

In [40]:
rf_tuned_train_acc = accuracy_score(Y_train, rf_tuned.predict(X_train))
rf_tuned_test_acc = accuracy_score(Y_test, Y_pred)

print(f"Training accuracy of Decision Tree is : {rf_tuned_train_acc}")
print(f"Test accuracy of Decision Tree is : {rf_tuned_test_acc}")

print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))

Training accuracy of Decision Tree is : 0.967459324155194
Test accuracy of Decision Tree is : 0.81
[[133  19]
 [ 19  29]]
              precision    recall  f1-score   support

           0       0.88      0.88      0.88       152
           1       0.60      0.60      0.60        48

    accuracy                           0.81       200
   macro avg       0.74      0.74      0.74       200
weighted avg       0.81      0.81      0.81       200

