# Insurance Fraud Detection Model
Importing Libraries needed 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
df = pd.read_csv('insurance_data.csv')

In [3]:
# Print all column names in the DataFrame
print(df.columns)


Index(['months_as_customer', 'age', 'policy_number', 'policy_bind_date',
       'policy_state', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'capital-gains', 'capital-loss',
       'incident_date', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_location', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_model', 'auto_year', 'fraud_reported', '_c39'],
      dtype='object')


In [4]:
df.head()


Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported,_c39
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,...,YES,71610,6510,13020,52080,Saab,92x,2004,Y,
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,...,?,5070,780,780,3510,Mercedes,E400,2007,Y,
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,...,NO,34650,7700,3850,23100,Dodge,RAM,2007,N,
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,...,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y,
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,...,NO,6500,1300,650,4550,Accura,RSX,2009,N,


# Null Value Detection

In [5]:
df.isnull().sum()

months_as_customer                0
age                               0
policy_number                     0
policy_bind_date                  0
policy_state                      0
policy_csl                        0
policy_deductable                 0
policy_annual_premium             0
umbrella_limit                    0
insured_zip                       0
insured_sex                       0
insured_education_level           0
insured_occupation                0
insured_hobbies                   0
insured_relationship              0
capital-gains                     0
capital-loss                      0
incident_date                     0
incident_type                     0
collision_type                    0
incident_severity                 0
authorities_contacted            91
incident_state                    0
incident_city                     0
incident_location                 0
incident_hour_of_the_day          0
number_of_vehicles_involved       0
property_damage             

we are dropping _c39 as it has over 1000 missing values and its insignificent to our dataset 

In [6]:
df.drop('_c39', axis=1,inplace=True)

Authorities_contacted is maybe important to the model so we are going to fill in the null values with most frequent value of the column

In [7]:
mode_value=df['authorities_contacted'].mode()[0]
df.loc[df['authorities_contacted'].isnull(),'authorities_contacted'] =mode_value

In [8]:
df.isnull().sum()

months_as_customer             0
age                            0
policy_number                  0
policy_bind_date               0
policy_state                   0
policy_csl                     0
policy_deductable              0
policy_annual_premium          0
umbrella_limit                 0
insured_zip                    0
insured_sex                    0
insured_education_level        0
insured_occupation             0
insured_hobbies                0
insured_relationship           0
capital-gains                  0
capital-loss                   0
incident_date                  0
incident_type                  0
collision_type                 0
incident_severity              0
authorities_contacted          0
incident_state                 0
incident_city                  0
incident_location              0
incident_hour_of_the_day       0
number_of_vehicles_involved    0
property_damage                0
bodily_injuries                0
witnesses                      0
police_rep

All the Null Values has been Fixed
 - c_39 is dropped
 - The NULL values of authorities_claimed has been replaced with mode value of the factor 

# Handling Categorical Values 

In [9]:
df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,...,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,...,0,?,5070,780,780,3510,Mercedes,E400,2007,Y
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,...,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,...,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,...,1,NO,6500,1300,650,4550,Accura,RSX,2009,N


Now we import one hot encoder and column transformer


In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

The Categorical features are taken and are transformed - Changing everything to numeric values 

We take the Y as 1 and N as 0 in the target column - fraud report 

In [11]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Assuming 'df' contains the necessary columns including 'incident_date' and 'policy_bind_date'

# Convert 'policy_bind_date' and 'incident_date' to datetime if they aren't already
df['policy_bind_date'] = pd.to_datetime(df['policy_bind_date'], errors='coerce')
df['incident_date'] = pd.to_datetime(df['incident_date'], errors='coerce')

# Extract date-related features for encoding
df['policy_bind_year'] = df['policy_bind_date'].dt.year.astype(str)
df['policy_bind_month'] = df['policy_bind_date'].dt.month.astype(str)
df['policy_bind_day'] = df['policy_bind_date'].dt.day.astype(str)
df['policy_bind_hour'] = df['policy_bind_date'].dt.hour.astype(str)
df['policy_bind_minute'] = df['policy_bind_date'].dt.minute.astype(str)
df['policy_bind_second'] = df['policy_bind_date'].dt.second.astype(str)

df['incident_year'] = df['incident_date'].dt.year.astype(str)
df['incident_month'] = df['incident_date'].dt.month.astype(str)
df['incident_day'] = df['incident_date'].dt.day.astype(str)
df['incident_hour'] = df['incident_date'].dt.hour.astype(str)
df['incident_minute'] = df['incident_date'].dt.minute.astype(str)
df['incident_second'] = df['incident_date'].dt.second.astype(str)

# List of categorical features to be one-hot encoded (including newly extracted date features)
categorical_features = [
    'policy_state', 'insured_sex', 'insured_education_level', 'insured_occupation',
    'insured_hobbies', 'insured_relationship', 'incident_type', 'collision_type',
    'incident_severity', 'authorities_contacted', 'incident_state', 'incident_city',
    'incident_location', 'property_damage', 'police_report_available', 'auto_make', 'auto_model',
    # Date-related columns
    'policy_bind_year', 'policy_bind_month', 'policy_bind_day', 'policy_bind_hour', 'policy_bind_minute', 'policy_bind_second',
    'incident_year', 'incident_month', 'incident_day', 'incident_hour', 'incident_minute', 'incident_second'
]

# Define the ColumnTransformer with OneHotEncoder for categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('one_hot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'  # Keep the remaining columns as they are
)

# Apply the transformations to the DataFrame
# Assuming 'df' is your DataFrame and it includes the target column 'fraud_reported'
X = df.drop(columns=['fraud_reported'])
y = df['fraud_reported'].map({'Y': 1, 'N': 0})  # Convert target to binary

# Fit and transform the features
X_transformed = preprocessor.fit_transform(X)

# Retrieve the one-hot encoded feature names
one_hot_feature_names = preprocessor.named_transformers_['one_hot'].get_feature_names_out(categorical_features)

# Combine with the names of the non-transformed features
non_transformed_features = [col for col in X.columns if col not in categorical_features]
all_feature_names = list(one_hot_feature_names) + non_transformed_features

# Create a DataFrame with the transformed data
df_transformed_X = pd.DataFrame(X_transformed, columns=all_feature_names)

# Display the first few rows of the transformed DataFrame
print(df_transformed_X.head())


  policy_state_IL policy_state_IN policy_state_OH insured_sex_FEMALE  \
0             0.0             0.0             1.0                0.0   
1             0.0             1.0             0.0                0.0   
2             0.0             0.0             1.0                1.0   
3             1.0             0.0             0.0                1.0   
4             1.0             0.0             0.0                0.0   

  insured_sex_MALE insured_education_level_Associate  \
0              1.0                               0.0   
1              1.0                               0.0   
2              0.0                               0.0   
3              0.0                               0.0   
4              1.0                               1.0   

  insured_education_level_College insured_education_level_High School  \
0                             0.0                                 0.0   
1                             0.0                                 0.0   
2          

Important Note : What we have done above is extract all the info from the dates in the dataset now we have to drop the dates as they are not in readable format

In [12]:
print(df.columns)

Index(['months_as_customer', 'age', 'policy_number', 'policy_bind_date',
       'policy_state', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'capital-gains', 'capital-loss',
       'incident_date', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_location', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_model', 'auto_year', 'fraud_reported', 'policy_bind_year',
       'policy_bind_month', 'policy_bind_day', 'policy_bind_hour',
       'policy_bind_minute', 'policy_bind_second', 'incident_year',
       'incident_month',

Dropping the date columns from the dataframe

In [13]:
# Drop the original date columns
df_transformed_X = df_transformed_X.drop(columns=['incident_date', 'policy_bind_date'])

# Check the updated DataFrame
print(df_transformed_X.head())


  policy_state_IL policy_state_IN policy_state_OH insured_sex_FEMALE  \
0             0.0             0.0             1.0                0.0   
1             0.0             1.0             0.0                0.0   
2             0.0             0.0             1.0                1.0   
3             1.0             0.0             0.0                1.0   
4             1.0             0.0             0.0                0.0   

  insured_sex_MALE insured_education_level_Associate  \
0              1.0                               0.0   
1              1.0                               0.0   
2              0.0                               0.0   
3              0.0                               0.0   
4              1.0                               1.0   

  insured_education_level_College insured_education_level_High School  \
0                             0.0                                 0.0   
1                             0.0                                 0.0   
2          

In [14]:
print(df.columns)

Index(['months_as_customer', 'age', 'policy_number', 'policy_bind_date',
       'policy_state', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'capital-gains', 'capital-loss',
       'incident_date', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_location', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_model', 'auto_year', 'fraud_reported', 'policy_bind_year',
       'policy_bind_month', 'policy_bind_day', 'policy_bind_hour',
       'policy_bind_minute', 'policy_bind_second', 'incident_year',
       'incident_month',

Categorical values have been handles and the datas have been dealt with

Now we are checking the TRANSFORMED X COLUMNS 

In [15]:
print(df_transformed_X.columns)


Index(['policy_state_IL', 'policy_state_IN', 'policy_state_OH',
       'insured_sex_FEMALE', 'insured_sex_MALE',
       'insured_education_level_Associate', 'insured_education_level_College',
       'insured_education_level_High School', 'insured_education_level_JD',
       'insured_education_level_MD',
       ...
       'capital-loss', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'bodily_injuries', 'witnesses',
       'total_claim_amount', 'injury_claim', 'property_claim', 'vehicle_claim',
       'auto_year'],
      dtype='object', length=1270)


Adding fraud_reported to the dataframe

In [16]:
# Adding X - fraud_reported back in the dataframe
df_transformed_X['fraud_reported'] = df['fraud_reported']


Splitting Feature X and Target Y

In [17]:
X = df_transformed_X.drop(columns=['fraud_reported'])
y = df_transformed_X['fraud_reported']

# Splitting the dataframe 

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

After splitting the Dataframe he have found some objects so we are going to solve that error

In [19]:
print(X_train.select_dtypes(include=['object']).columns)

Index(['policy_state_IL', 'policy_state_IN', 'policy_state_OH',
       'insured_sex_FEMALE', 'insured_sex_MALE',
       'insured_education_level_Associate', 'insured_education_level_College',
       'insured_education_level_High School', 'insured_education_level_JD',
       'insured_education_level_MD',
       ...
       'capital-loss', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'bodily_injuries', 'witnesses',
       'total_claim_amount', 'injury_claim', 'property_claim', 'vehicle_claim',
       'auto_year'],
      dtype='object', length=1270)


converting non-numeric columns to numerical columns 

In [20]:
# For example, if 'policy_csl' contains values like '500/1000'
X_train['policy_csl'] = X_train['policy_csl'].apply(lambda x: float(x.split('/')[0]) if isinstance(x, str) else x)
X_test['policy_csl'] = X_test['policy_csl'].apply(lambda x: float(x.split('/')[0]) if isinstance(x, str) else x)


# Feature Scaling the entire dataset

In [21]:
print(X_train.dtypes)  # Ensure all columns are now numeric types

policy_state_IL       object
policy_state_IN       object
policy_state_OH       object
insured_sex_FEMALE    object
insured_sex_MALE      object
                       ...  
total_claim_amount    object
injury_claim          object
property_claim        object
vehicle_claim         object
auto_year             object
Length: 1270, dtype: object


Converting the objects to non numeric values

In [22]:
# Convert all object columns to numeric
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')


Checking weather any null variable are there again in the converted df

# Check for any NaN values in the data
print(X_train.isnull().sum())


If in case any values are null we are replacing them with the mean of the cloumn 

In [23]:
# Fill NaN values with the mean of the column
X_train.fillna(X_train.mean(), inplace=True)
X_test.fillna(X_test.mean(), inplace=True)


Applying StandardScaler 

In [24]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [25]:
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Convert string labels to numerical labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# List of classifiers to test
classifiers = [
    LogisticRegression(max_iter=1000, random_state=42),
    SVC(class_weight='balanced', random_state=42),
    RandomForestClassifier(class_weight='balanced', random_state=42),
    GradientBoostingClassifier(random_state=42),
    KNeighborsClassifier(),
    GaussianNB(),
    XGBClassifier(eval_metric='mlogloss', use_label_encoder=False, random_state=42)
]

# Train and evaluate each model
results = {}
for clf in classifiers:
    model_name = clf.__class__.__name__
    clf.fit(X_train_scaled, y_train_encoded)  # Train the model
    y_pred = clf.predict(X_test_scaled)  # Make predictions
    accuracy = accuracy_score(y_test_encoded, y_pred)  # Accuracy
    report = classification_report(y_test_encoded, y_pred, output_dict=True)  # Precision, recall, f1-score

    results[model_name] = {
        'accuracy': accuracy,
        'precision': report['weighted avg']['precision'],
        'recall': report['weighted avg']['recall'],
        'f1-score': report['weighted avg']['f1-score']
    }

# Display the results
results_df = pd.DataFrame(results).T.sort_values(by='f1-score', ascending=False)
print(results_df)


Parameters: { "use_label_encoder" } are not used.



                            accuracy  precision  recall  f1-score
GradientBoostingClassifier     0.810   0.814707   0.810  0.812005
XGBClassifier                  0.780   0.771478   0.780  0.774326
SVC                            0.730   0.686981   0.730  0.663696
KNeighborsClassifier           0.705   0.645694   0.705  0.652051
RandomForestClassifier         0.725   0.671711   0.725  0.648278
LogisticRegression             0.730   0.690563   0.730  0.644655
GaussianNB                     0.280   0.801005   0.280  0.129026
