# Format data to be ready to use

In [1]:
### Import and format data ###

import numpy as np
import pandas as pd

# select columns in the desired order
sort_column = ['Month', 'WeekOfMonth', 'DayOfWeek', 'DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed', 'Year', 'PolicyNumber', 'RepNumber', 'PolicyType', 'VehicleCategory', 'BasePolicy', 'Make', 'VehiclePrice', 'AgeOfVehicle', 'Deductible', 'Days_Policy_Accident', 'Days_Policy_Claim', 'PastNumberOfClaims', 'NumberOfSuppliments', 'NumberOfCars', 'AgentType', 'Fault', 'AccidentArea', 'PoliceReportFiled', 'WitnessPresent', 'AddressChange_Claim', 'AgeOfPolicyHolder', 'Sex', 'MaritalStatus', 'DriverRating', 'Age', 'FraudFound_P']

column_drop = ['Month', 'WeekOfMonth', 'DayOfWeek', 'DayOfWeekClaimed', 'MonthClaimed',
                 'WeekOfMonthClaimed', 'Year', 'PolicyNumber', 'RepNumber', 'PolicyType']

# Data for test
df_test = pd.read_excel('fraud_test.xlsx',skiprows=[0])
df_test.loc[df_test['PolicyType'] == 'Sedan - Liability', 'VehicleCategory'] = 'Sedan'

df_test = df_test[sort_column]

df_test = df_test.drop(column_drop,axis=1)

# Data for train 
df  = pd.read_excel('fraud_data.xlsx',sheet_name='fraud_data')
df.loc[df['PolicyType'] == 'Sedan - Liability', 'VehicleCategory'] = 'Sedan'

df = df[sort_column]

df = df.drop(column_drop,axis=1)

In [2]:
### Check which column has unique values differ between df and df_test ###

column = []

for i in df.columns:
    print('{:<30} -> {:<3} (df): {} (df_test)'.format(i,len(df[i].unique()),len(df_test[i].unique())))
    if len(df[i].unique()) != len(df_test[i].unique()):
        column.append(i)
print('column : \n{}'.format(column))

VehicleCategory                -> 3   (df): 3 (df_test)
BasePolicy                     -> 3   (df): 3 (df_test)
Make                           -> 19  (df): 14 (df_test)
VehiclePrice                   -> 6   (df): 6 (df_test)
AgeOfVehicle                   -> 8   (df): 8 (df_test)
Deductible                     -> 4   (df): 3 (df_test)
Days_Policy_Accident           -> 5   (df): 5 (df_test)
Days_Policy_Claim              -> 4   (df): 3 (df_test)
PastNumberOfClaims             -> 4   (df): 4 (df_test)
NumberOfSuppliments            -> 4   (df): 4 (df_test)
NumberOfCars                   -> 5   (df): 3 (df_test)
AgentType                      -> 2   (df): 2 (df_test)
Fault                          -> 2   (df): 2 (df_test)
AccidentArea                   -> 2   (df): 2 (df_test)
PoliceReportFiled              -> 2   (df): 2 (df_test)
WitnessPresent                 -> 2   (df): 2 (df_test)
AddressChange_Claim            -> 5   (df): 4 (df_test)
AgeOfPolicyHolder              -> 9   (df): 9 (

In [3]:
columns = [col for col in column if col not in ['Age', 'FraudFound_P','Deductible']]

print(columns)

['Make', 'Days_Policy_Claim', 'NumberOfCars', 'AddressChange_Claim']


In [4]:
for i in columns:
    # get the unique values in the column for both df and df_test
    unique_values = set(df[i].unique()) - set(df_test[i].unique())
    print('{:<20} -> {}'.format(i,unique_values))
    for j in unique_values:
        name = str(i)+'_'+str(j)
        df_test[name] = 0

Make                 -> {'Ferrari', 'Mecedes', 'Saturn', 'Lexus', 'Porche'}
Days_Policy_Claim    -> {'none'}
NumberOfCars         -> {'5 to 8', 'more than 8'}
AddressChange_Claim  -> {'under 6 months'}


In [5]:
df = pd.get_dummies(df)
X = df.drop(['FraudFound_P'],axis=1)
y = df[['FraudFound_P']]

X_test = df_test.drop(['FraudFound_P'],axis=1)
X_test = pd.get_dummies(X_test)
X_test = X_test[X.columns]

# Try to use ANN model to detect fraud

In [6]:
from tensorflow.keras.models import load_model

### Load ANN model ###

filename = 'ANN.h5'

model_ann =  load_model(filename)

y_pred = model_ann.predict(X_test)

thresholds = [x/100 for x in range(5, 100, 5)]

for threshold in thresholds:
    y_pred_2 = np.where(y_pred > threshold , 1, 0)
    print('threshold : {}'.format(threshold))
    df_test['FraudFound_P'] = y_pred_2
    print(df_test['FraudFound_P'])
    print(df_test['FraudFound_P'].unique())
    print('index (Fraud) : {}'.format(df_test[df_test['FraudFound_P'] == 1].index.tolist()))
    print('-'*40)

threshold : 0.05
0      0
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Name: FraudFound_P, Length: 1000, dtype: int32
[0]
index (Fraud) : []
----------------------------------------
threshold : 0.1
0      0
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Name: FraudFound_P, Length: 1000, dtype: int32
[0]
index (Fraud) : []
----------------------------------------
threshold : 0.15
0      0
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Name: FraudFound_P, Length: 1000, dtype: int32
[0]
index (Fraud) : []
----------------------------------------
threshold : 0.2
0      0
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Name: FraudFound_P, Length: 1000, dtype: int32
[0]
index (Fraud) : []
----------------------------------------
threshold : 0.25
0      0
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997  

# Try to use kNN model to detect fraud

In [7]:
### Import and format data ###

from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

scaler = MinMaxScaler()

smote = SMOTE(random_state=0)

X_scaled = scaler.fit_transform(X)

X_train, y_train = smote.fit_resample(X_scaled, y.values.ravel())

### Load kNN model ###

filename = 'KNN.pkl'
 
import joblib

model_knn = joblib.load(filename)

model_knn.fit(X_train, y_train)

y_pred_proba = model_knn.predict_proba(X_test)[:, 1]  # predicted probabilities of positive class

thresholds = [x/100 for x in range(5, 100, 5)]

for threshold in thresholds:
    y_pred = (y_pred_proba >= threshold).astype(int)  # predicted class labels
    print('threshold : {}'.format(threshold))
    df_test['FraudFound_P'] = y_pred
    print(df_test['FraudFound_P'])
    print(df_test['FraudFound_P'].unique())
    print('index (Fraud) : {}'.format(df_test[df_test['FraudFound_P'] == 1].index.tolist()))
    print('-'*40)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


threshold : 0.05
0      0
1      1
2      0
3      0
4      0
      ..
995    1
996    0
997    0
998    0
999    0
Name: FraudFound_P, Length: 1000, dtype: int32
[0 1]
index (Fraud) : [1, 134, 167, 545, 647, 699, 745, 815, 861, 909, 948, 956, 991, 995]
----------------------------------------
threshold : 0.1
0      0
1      1
2      0
3      0
4      0
      ..
995    1
996    0
997    0
998    0
999    0
Name: FraudFound_P, Length: 1000, dtype: int32
[0 1]
index (Fraud) : [1, 134, 167, 545, 647, 699, 745, 815, 861, 909, 948, 956, 991, 995]
----------------------------------------
threshold : 0.15
0      0
1      1
2      0
3      0
4      0
      ..
995    1
996    0
997    0
998    0
999    0
Name: FraudFound_P, Length: 1000, dtype: int32
[0 1]
index (Fraud) : [1, 134, 167, 545, 647, 699, 745, 815, 861, 909, 948, 956, 991, 995]
----------------------------------------
threshold : 0.2
0      0
1      1
2      0
3      0
4      0
      ..
995    1
996    0
997    0
998    0
999    0
N

From the result above , I get predition from these 2 models

If I choose only one , I believe result from kNN more than ANN because ANN in all threshold that predict all rows be not fraud 

In real world , If there are some cases which are fraudulent claim but I predict them to be not fraud.They will damage our company greatly 

Then I believe kNN for make sure although there are some cases who wrongly predicted but It's more careful 

After that , you may detect again by using other methods