<a href="https://colab.research.google.com/github/ozkalt/Car-Insurance-Fraud-Detection/blob/main/insurance_fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
!pip install -U imbalanced-learn
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [23]:
# insurance-fraud-detection-pipeline: 02_pipeline_training.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings
# Silence every warning for the rest of the session. This is a blanket filter:
# it also hides warnings raised by the estimators and metrics used further
# down, so remove it when debugging unexpected model behaviour.
warnings.filterwarnings("ignore")

In [31]:
# 1. Load Data
# The CSV location is relative to the notebook's working directory.
DATA_PATH = "data/Simulated_Car_Insurance_Data.csv"
data = pd.read_csv(DATA_PATH)

In [32]:
# Dataset dimensions as (rows, columns).
data.shape

(10000, 32)

In [33]:
# Preview the first five rows to eyeball value formats in each column.
data.head()

Unnamed: 0,vehicle_type,vehicle_make,vehicle_model,vehicle_year,vehicle_age,vehicle_condition,vehicle_price,engine_type,age,gender,...,police_report,repair_estimate,actual_repair_cost,submission_channel,payment_channel,customer_contacted,agent_id,agent_city,fraud_reported,net_premium
0,SUV,Honda,Accord,2008,17,Used,22511.65,Electric,44,Male,...,1,5870.9,4760.98,In-Person,Check,1,A754,South,0,2084.19
1,Sedan,Mercedes,GLA,2001,24,Damaged,14595.27,Petrol,30,Female,...,1,2969.04,5162.09,Online,Cash,1,A214,East,0,1450.88
2,Sedan,BMW,3 Series,2007,18,New,6948.39,Petrol,53,Female,...,0,6539.25,2059.1,Phone,Bank Transfer,1,A125,North,0,779.61
3,Convertible,Mercedes,GLA,2019,6,Used,29836.16,Electric,30,Female,...,0,2991.44,4163.21,Online,Check,0,A859,Central,0,1262.49
4,Sedan,Mercedes,C-Class,2001,24,Used,21314.5,Hybrid,58,Female,...,1,5781.6,3822.91,In-Person,Bank Transfer,1,A381,South,0,1846.45


In [35]:
# Full list of column names (the head() preview elides the middle columns).
data.columns

Index(['vehicle_type', 'vehicle_make', 'vehicle_model', 'vehicle_year',
       'vehicle_age', 'vehicle_condition', 'vehicle_price', 'engine_type',
       'age', 'gender', 'occupation', 'region', 'driving_experience',
       'claims_history', 'policy_tenure', 'policy_type', 'claim_id',
       'claim_date', 'claim_amount', 'claim_reason', 'damage_type',
       'reported_delay', 'police_report', 'repair_estimate',
       'actual_repair_cost', 'submission_channel', 'payment_channel',
       'customer_contacted', 'agent_id', 'agent_city', 'fraud_reported',
       'net_premium'],
      dtype='object')

In [36]:
# Derive a combined "<make>_<model>" feature, then show how many rows fall
# into each make/model pair.
data['vehicle_make_model'] = data['vehicle_make'].str.cat(data['vehicle_model'], sep='_')
data['vehicle_make_model'].value_counts()

Unnamed: 0_level_0,count
vehicle_make_model,Unnamed: 1_level_1
Honda_Civic,586
Mercedes_C-Class,586
Hyundai_Elantra,574
Ford_Escape,573
Toyota_Camry,573
Toyota_RAV4,567
Ford_F-150,563
Ford_Focus,556
Hyundai_Sonata,555
Honda_Accord,553


In [39]:
# Relative frequency of each target class. The recorded output shows about 3%
# positive (fraud) rows, i.e. a heavily imbalanced target.
data['fraud_reported'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
fraud_reported,Unnamed: 1_level_1
0,0.9695
1,0.0305


In [40]:
# 2. Feature & Target Definition
# Everything except the label and the claim identifier is used as a feature.
# NOTE(review): claim_date and agent_id remain in X; if they load as strings
# they will be treated as ordinary categorical features downstream - confirm
# that this is intended.
non_feature_cols = ['fraud_reported', 'claim_id']
X = data.drop(columns=non_feature_cols)
y = data['fraud_reported']

In [41]:
# 3. Split Data
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [42]:
# 4. Column Types
# Route each feature column to a transformer based on its dtype.
# "number" matches every integer/float width (not only int64/float64), and the
# categorical selection also picks up category and bool columns. Columns that
# appear in neither list are dropped by the ColumnTransformer built from these
# lists, so narrow dtype filters can silently discard features.
categorical_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
numerical_cols = X.select_dtypes(include="number").columns.tolist()

# Report any column that matched neither selection (for example datetimes)
# instead of letting it disappear from the model input unnoticed.
_assigned_cols = set(categorical_cols) | set(numerical_cols)
unassigned_cols = [col for col in X.columns if col not in _assigned_cols]
if unassigned_cols:
    print(f"Warning: columns not assigned to any transformer: {unassigned_cols}")

In [43]:
# 5. Preprocessing Pipelines
# Numeric columns: fill missing values with the column median, then
# standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each sub-pipeline to its own column group.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [44]:
# 6. Models to Train
# Candidate classifiers keyed by the display name used in the training log and
# in the comparison table. Every model is seeded with 42 for repeatability.
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    # `use_label_encoder` is intentionally not passed: XGBoost deprecated the
    # flag and current releases only report it as an unused parameter.
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
    "NeuralNetwork": MLPClassifier(random_state=42, max_iter=300),
    # verbose=100 makes CatBoost log progress every 100 iterations.
    "CatBoost": CatBoostClassifier(iterations=500,
                                   learning_rate=0.1,
                                   depth=6,
                                   eval_metric='AUC',
                                   random_seed=42,
                                   verbose=100),
}

In [45]:
# 7. Training and Evaluation Loop
# Every candidate model gets its own imblearn pipeline: preprocessing, SMOTE
# oversampling, then the classifier. imblearn applies the sampler only while
# fitting, so predictions on the test split are made on un-resampled data.
results = []  # one dict of fraud-class metrics per model, read by the summary cell

for name, model in models.items():
    print(f"\nTraining model: {name}")

    pipeline = ImbPipeline([
        ('preprocessing', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    # Score for the positive class (second probability column); None when the
    # classifier offers no predict_proba, in which case ROC-AUC is skipped.
    y_proba = pipeline.predict_proba(X_test)[:, 1] if hasattr(pipeline, "predict_proba") else None

    # zero_division=0 states explicitly what happens when a model predicts no
    # positive samples at all (precision is reported as 0.0) instead of
    # depending on the default warn-and-zero behaviour.
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    auc = roc_auc_score(y_test, y_proba) if y_proba is not None else None

    # Keep only the metrics of class '1' (fraud) for the comparison table.
    results.append({
        "Model": name,
        "Precision": report['1']['precision'],
        "Recall": report['1']['recall'],
        "F1-score": report['1']['f1-score'],
        "ROC-AUC": auc
    })

    print(classification_report(y_test, y_pred, zero_division=0))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    # Compare against None explicitly: a legitimate AUC of 0.0 is falsy and
    # would otherwise be left out of the log.
    if auc is not None:
        print(f"ROC-AUC Score: {auc:.4f}")


Training model: RandomForest
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1939
           1       0.00      0.00      0.00        61

    accuracy                           0.97      2000
   macro avg       0.48      0.50      0.49      2000
weighted avg       0.94      0.97      0.95      2000

Confusion Matrix:
 [[1939    0]
 [  61    0]]
ROC-AUC Score: 0.5144

Training model: XGBoost
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1939
           1       0.00      0.00      0.00        61

    accuracy                           0.97      2000
   macro avg       0.48      0.50      0.49      2000
weighted avg       0.94      0.97      0.95      2000

Confusion Matrix:
 [[1938    1]
 [  61    0]]
ROC-AUC Score: 0.5152

Training model: LightGBM
[LightGBM] [Info] Number of positive: 7756, number of negative: 7756
[LightGBM] [Info] Auto-choosing row-wise multi-threading, th

In [46]:
# 8. Summary
# Collect the per-model metric dicts into a table and list the models from
# best to worst fraud-class F1.
print("\nModel Comparison Summary:")
summary_df = pd.DataFrame(results)
ranked_summary = summary_df.sort_values(by="F1-score", ascending=False)
print(ranked_summary)


Model Comparison Summary:
           Model  Precision  Recall  F1-score   ROC-AUC
0   RandomForest        0.0     0.0       0.0  0.514424
1        XGBoost        0.0     0.0       0.0  0.515189
2       LightGBM        0.0     0.0       0.0  0.527811
3  NeuralNetwork        0.0     0.0       0.0  0.506286
4       CatBoost        0.0     0.0       0.0  0.522967
