In [128]:
import pandas as pd

df = pd.read_csv("insurance_claims.csv")
df.head()



Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,328,48,521585,10/17/2014,OH,250/500,1000,1406.91,0,466132,...,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y
1,228,42,342868,6/27/2006,IN,250/500,2000,1197.22,5000000,468176,...,0,,5070,780,780,3510,Mercedes,E400,2007,Y
2,134,29,687698,9/6/2000,OH,100/300,2000,1413.14,5000000,430632,...,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N
3,256,41,227811,5/25/1990,IL,250/500,2000,1415.74,6000000,608117,...,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y
4,228,44,367455,6/6/2014,IL,500/1000,1000,1583.91,6000000,610706,...,1,NO,6500,1300,650,4550,Accura,RSX,2009,N


In [129]:
df.info()


<class 'pandas.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 39 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   months_as_customer           1000 non-null   int64  
 1   age                          1000 non-null   int64  
 2   policy_number                1000 non-null   int64  
 3   policy_bind_date             1000 non-null   str    
 4   policy_state                 1000 non-null   str    
 5   policy_csl                   1000 non-null   str    
 6   policy_deductable            1000 non-null   int64  
 7   policy_annual_premium        1000 non-null   float64
 8   umbrella_limit               1000 non-null   int64  
 9   insured_zip                  1000 non-null   int64  
 10  insured_sex                  1000 non-null   str    
 11  insured_education_level      1000 non-null   str    
 12  insured_occupation           1000 non-null   str    
 13  insured_hobbies              1

In [130]:
df.shape


(1000, 39)

In [131]:
df.isna().sum()

months_as_customer               0
age                              0
policy_number                    0
policy_bind_date                 0
policy_state                     0
policy_csl                       0
policy_deductable                0
policy_annual_premium            0
umbrella_limit                   0
insured_zip                      0
insured_sex                      0
insured_education_level          0
insured_occupation               0
insured_hobbies                  0
insured_relationship             0
capital-gains                    0
capital-loss                     0
incident_date                    0
incident_type                    0
collision_type                 178
incident_severity                0
authorities_contacted           91
incident_state                   0
incident_city                    0
incident_location                0
incident_hour_of_the_day         0
number_of_vehicles_involved      0
property_damage                360
bodily_injuries     

In [132]:
# Columns with missing values + missing count
missing_cols = (
    df.isna()
      .sum()
      .loc[lambda x: x > 0]
      .sort_values(ascending=False)
      .to_frame("missing_count")
)

missing_cols


Unnamed: 0,missing_count
property_damage,360
police_report_available,343
collision_type,178
authorities_contacted,91


In [134]:
data=df.copy()

In [135]:
# Fill missing values for incident-related categorical columns

data["property_damage"] = data["property_damage"].fillna("Unknown")
data["collision_type"] = data["collision_type"].fillna("Unknown")

data["police_report_available"] = data["police_report_available"].fillna("NO")
data["authorities_contacted"] = data["authorities_contacted"].fillna("None")


In [136]:
import numpy as np

data['customer_maturity'] = (
    (data['age'] - 18) /
    (data['months_as_customer'] / 12)
)

data['customer_maturity'] = data['customer_maturity'].replace(
    [np.inf, -np.inf], np.nan
)


In [137]:
data['capital_gain_loss'] = data['capital-gains'] + data['capital-loss']


In [138]:
data['insured_zip'].nunique()

995

In [139]:
data['incident_location'].nunique()

1000

In [151]:
# Create ratios using existing total_claim_amount
data['injury_claim_ratio']   = data['injury_claim']   / data['total_claim_amount']
data['property_claim_ratio'] = data['property_claim'] / data['total_claim_amount']
data['vehicle_claim_ratio']  = data['vehicle_claim']  / data['total_claim_amount']

# Clean ratios: replace invalid values with 0
data[['injury_claim_ratio',
      'property_claim_ratio',
      'vehicle_claim_ratio']] = (
    data[['injury_claim_ratio',
          'property_claim_ratio',
          'vehicle_claim_ratio']]
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)


In [153]:
# --- Date/time feature engineering (renamed) ---

# Parse dates
data['incident_date'] = pd.to_datetime(data['incident_date'], errors='coerce')
data['policy_bind_date'] = pd.to_datetime(data['policy_bind_date'], errors='coerce')

# Incident week number of year (ISO week)
data['incident_week_no'] = data['incident_date'].dt.isocalendar().week.astype('Int64')

# Policy → incident difference in days (renamed as requested)
data['responsible_days'] = (data['incident_date'] - data['policy_bind_date']).dt.days

# Incident hour buckets
data['incident_hour_bucket'] = pd.cut(
    data['incident_hour_of_the_day'],
    bins=[0, 6, 12, 18, 24],
    labels=['morning', 'afternoon', 'evening', 'night'],
    right=False,
    include_lowest=True
)

# Handle missing hours
data['incident_hour_bucket'] = data['incident_hour_bucket'].astype('object').fillna('unknown')


In [154]:
# Vehicle age = incident year - auto_year
data['vehicle_age'] = data['incident_date'].dt.year - data['auto_year']

# Optional safety cleanup
data['vehicle_age'] = data['vehicle_age'].clip(lower=0)


In [157]:
# Final list of columns to drop (validated against Kaggle dataset)
drop_cols = [
    # Dates & raw time (already engineered)
    'incident_date',
    'policy_bind_date',
    'incident_hour_of_the_day',

    # Raw claim components (ratios used instead)
    'injury_claim',
    'property_claim',
    'vehicle_claim',

    # Customer raw fields (engineered already)
    'months_as_customer',
    'age',

    # Vehicle details (unnecessary after vehicle_age)
    'auto_make',
    'auto_model',
    'auto_year',

    # Location & identifiers (leakage / high-cardinality)
    'incident_city',
    'incident_state',
    'insured_zip',
    'policy_state',
    'policy_number',

    # Financial noise
    'capital-gains',
    'capital-loss'
]

# Drop columns and save final modeling dataset
df_mod = data.drop(columns=drop_cols).copy()


In [158]:
from sklearn.preprocessing import MinMaxScaler

# Convert target to binary
df_mod['fraud_reported'] = df_mod['fraud_reported'].map({'Y': 1, 'N': 0})

# Separate features and target
X = df_mod.drop(columns=['fraud_reported'])
y = df_mod['fraud_reported']

# One-hot encode categorical features
X_encoded = pd.get_dummies(X, drop_first=True)

# Min-Max scaling
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X_encoded),
    columns=X_encoded.columns,
    index=X_encoded.index
)

# Final dataset (scaled features + target)
df_final = pd.concat([X_scaled, y], axis=1)


In [159]:
## Splitting data

from sklearn.model_selection import train_test_split

# Separate features and target
X = df_final.drop(columns=['fraud_reported'])
y = df_final['fraud_reported']

# Step 1: split out test set (25%)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Step 2: split remaining data into train (65%) and validation (10%)
# Remaining = 75% → train = 65/75, val = 10/75
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=(10/75), random_state=42, stratify=y_temp
)

# Sanity check
print("Train:", X_train.shape)
print("Validation:", X_val.shape)
print("Test:", X_test.shape)


Train: (650, 1079)
Validation: (100, 1079)
Test: (250, 1079)


In [161]:
# Common imports (run once)
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

def evaluate_binary(model_name, y_true, y_prob, threshold=0.5):
    y_pred = (y_prob >= threshold).astype(int)
    print(f"\n===== {model_name} | threshold={threshold:.2f} =====")
    print("ROC-AUC:", round(roc_auc_score(y_true, y_prob), 4))
    print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred, digits=4))


In [163]:
# Quick check: how many NaNs are left?
import numpy as np

print("NaNs in X_train:", np.isnan(X_train).sum())
print("NaNs in X_val  :", np.isnan(X_val).sum())
print("NaNs in X_test :", np.isnan(X_test).sum())


NaNs in X_train: policy_deductable               0
policy_annual_premium           0
umbrella_limit                  0
number_of_vehicles_involved     0
bodily_injuries                 0
                               ..
property_damage_YES             0
police_report_available_YES     0
incident_hour_bucket_evening    0
incident_hour_bucket_morning    0
incident_hour_bucket_night      0
Length: 1079, dtype: int64
NaNs in X_val  : policy_deductable               0
policy_annual_premium           0
umbrella_limit                  0
number_of_vehicles_involved     0
bodily_injuries                 0
                               ..
property_damage_YES             0
police_report_available_YES     0
incident_hour_bucket_evening    0
incident_hour_bucket_morning    0
incident_hour_bucket_night      0
Length: 1079, dtype: int64
NaNs in X_test : policy_deductable               0
policy_annual_premium           0
umbrella_limit                  0
number_of_vehicles_involved     0
bodily_inju

In [169]:
X_train["customer_maturity"] = X_train["customer_maturity"].fillna(0)
X_val["customer_maturity"]   = X_val["customer_maturity"].fillna(0)
X_test["customer_maturity"]  = X_test["customer_maturity"].fillna(0)

# verify
print(
    X_train["customer_maturity"].isna().sum(),
    X_val["customer_maturity"].isna().sum(),
    X_test["customer_maturity"].isna().sum()
)


0 0 0


In [170]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=2000, class_weight="balanced", n_jobs=-1)
log_reg.fit(X_train, y_train)

val_prob = log_reg.predict_proba(X_val)[:, 1]
evaluate_binary("Logistic Regression", y_val, val_prob, threshold=0.5)



===== Logistic Regression | threshold=0.50 =====
ROC-AUC: 0.8981
Confusion matrix:
 [[66  9]
 [ 2 23]]
              precision    recall  f1-score   support

           0     0.9706    0.8800    0.9231        75
           1     0.7188    0.9200    0.8070        25

    accuracy                         0.8900       100
   macro avg     0.8447    0.9000    0.8650       100
weighted avg     0.9076    0.8900    0.8941       100





In [165]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(
    max_depth=6,
    min_samples_split=20,
    min_samples_leaf=10,
    class_weight="balanced",
    random_state=42
)
dt.fit(X_train, y_train)

val_prob = dt.predict_proba(X_val)[:, 1]
evaluate_binary("Decision Tree", y_val, val_prob, threshold=0.5)



===== Decision Tree | threshold=0.50 =====
ROC-AUC: 0.8208
Confusion matrix:
 [[60 15]
 [ 7 18]]
              precision    recall  f1-score   support

           0     0.8955    0.8000    0.8451        75
           1     0.5455    0.7200    0.6207        25

    accuracy                         0.7800       100
   macro avg     0.7205    0.7600    0.7329       100
weighted avg     0.8080    0.7800    0.7890       100



In [166]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=5,
    class_weight="balanced_subsample",
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

val_prob = rf.predict_proba(X_val)[:, 1]
evaluate_binary("Random Forest", y_val, val_prob, threshold=0.5)



===== Random Forest | threshold=0.50 =====
ROC-AUC: 0.8427
Confusion matrix:
 [[64 11]
 [ 9 16]]
              precision    recall  f1-score   support

           0     0.8767    0.8533    0.8649        75
           1     0.5926    0.6400    0.6154        25

    accuracy                         0.8000       100
   macro avg     0.7347    0.7467    0.7401       100
weighted avg     0.8057    0.8000    0.8025       100



In [171]:
# Show only columns that contain at least one NaN (train / val / test)
nan_summary = {
    "train": X_train.isna().sum(),
    "val":   X_val.isna().sum(),
    "test":  X_test.isna().sum()
}

for split, s in nan_summary.items():
    cols = s[s > 0]
    print(f"\n{split.upper()} – columns with NaNs ({len(cols)}):")
    print(cols if len(cols) > 0 else "None")



TRAIN – columns with NaNs (0):
None

VAL – columns with NaNs (0):
None

TEST – columns with NaNs (0):
None


In [172]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)
gb.fit(X_train, y_train)

val_prob = gb.predict_proba(X_val)[:, 1]
evaluate_binary("Gradient Boosting (sklearn)", y_val, val_prob, threshold=0.5)



===== Gradient Boosting (sklearn) | threshold=0.50 =====
ROC-AUC: 0.896
Confusion matrix:
 [[68  7]
 [ 8 17]]
              precision    recall  f1-score   support

           0     0.8947    0.9067    0.9007        75
           1     0.7083    0.6800    0.6939        25

    accuracy                         0.8500       100
   macro avg     0.8015    0.7933    0.7973       100
weighted avg     0.8481    0.8500    0.8490       100



In [173]:
from xgboost import XGBClassifier

# handle imbalance: scale_pos_weight = (#neg / #pos) on TRAIN only
neg = int((y_train == 0).sum())
pos = int((y_train == 1).sum())
scale_pos_weight = neg / max(pos, 1)

xgb = XGBClassifier(
    n_estimators=800,
    learning_rate=0.03,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    min_child_weight=1,
    gamma=0,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1,
    eval_metric="logloss"
)

xgb.fit(X_train, y_train)

val_prob = xgb.predict_proba(X_val)[:, 1]
evaluate_binary("XGBoost", y_val, val_prob, threshold=0.5)



===== XGBoost | threshold=0.50 =====
ROC-AUC: 0.8752
Confusion matrix:
 [[69  6]
 [ 7 18]]
              precision    recall  f1-score   support

           0     0.9079    0.9200    0.9139        75
           1     0.7500    0.7200    0.7347        25

    accuracy                         0.8700       100
   macro avg     0.8289    0.8200    0.8243       100
weighted avg     0.8684    0.8700    0.8691       100



In [174]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(
    n_estimators=1200,
    learning_rate=0.03,
    num_leaves=31,
    subsample=0.9,
    colsample_bytree=0.9,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

lgbm.fit(X_train, y_train)

val_prob = lgbm.predict_proba(X_val)[:, 1]
evaluate_binary("LightGBM", y_val, val_prob, threshold=0.5)


[LightGBM] [Info] Number of positive: 160, number of negative: 490
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000351 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1312
[LightGBM] [Info] Number of data points in the train set: 650, number of used features: 79
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000

===== LightGBM | threshold=0.50 =====
ROC-AUC: 0.8171
Confusion matrix:
 [[69  6]
 [12 13]]
              precision    recall  f1-score   support

           0     0.8519    0.9200    0.8846        75
           1     0.6842    0.5200    0.5909        25

    accuracy                         0.8200       100
   macro avg     0.7680    0.7200    0.7378       100
weighted avg     0.8099    0.8200    0.8112       100



In [175]:
import tensorflow as tf
from tensorflow.keras import layers, callbacks, models

# optional: class weights (recommended for fraud)
neg = float((y_train == 0).sum())
pos = float((y_train == 1).sum())
total = neg + pos
class_weight = {
    0: (1.0 / neg) * (total / 2.0),
    1: (1.0 / pos) * (total / 2.0)
}

tf.keras.backend.clear_session()
tf.random.set_seed(42)

model = models.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(256, activation="relu"),
     layers.Dropout(0.3),
    layers.Dense(128, activation="relu"),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(64, activation="relu"),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(1, activation="linear")
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.002),
    loss="binary_crossentropy",
    metrics=[tf.keras.metrics.AUC(name="auc")]
)

early_stop = callbacks.EarlyStopping(
    monitor="val_auc",
    mode="max",
    patience=25,
    restore_best_weights=True
)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=60,
    batch_size=256,
    class_weight=class_weight,
    callbacks=[early_stop],
    verbose=1
)

val_prob = model.predict(X_val, verbose=0).ravel()
evaluate_binary("Keras NN", y_val, val_prob, threshold=0.5)



Epoch 1/60
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 125ms/step - auc: 0.4811 - loss: 6.6149 - val_auc: 0.5019 - val_loss: 3.7784
Epoch 2/60
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - auc: 0.5694 - loss: 5.5429 - val_auc: 0.5200 - val_loss: 3.8975
Epoch 3/60
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - auc: 0.6045 - loss: 5.0501 - val_auc: 0.5200 - val_loss: 3.9011
Epoch 4/60
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - auc: 0.6166 - loss: 4.8896 - val_auc: 0.5136 - val_loss: 3.8994
Epoch 5/60
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - auc: 0.6048 - loss: 5.2101 - val_auc: 0.5123 - val_loss: 3.7845
Epoch 6/60
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - auc: 0.6200 - loss: 4.8190 - val_auc: 0.5579 - val_loss: 3.4288
Epoch 7/60
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - auc: 0.6375 - loss:


=== Validation Results (sorted by F1, then Recall, then AUC) ===
                 model  threshold   roc_auc  accuracy  precision_fraud(1)  \
0  Logistic Regression        0.5  0.898133      0.89            0.718750   
1              XGBoost        0.5  0.875200      0.87            0.750000   
2    Gradient Boosting        0.5  0.896000      0.85            0.708333   
3        Decision Tree        0.5  0.820800      0.78            0.545455   
4        Random Forest        0.5  0.842667      0.80            0.592593   
5             Keras NN        0.5  0.852267      0.82            0.666667   
6             LightGBM        0.5  0.817067      0.82            0.684211   

   recall_fraud(1)  f1_fraud(1)  tp  fp  tn  fn  
0             0.92     0.807018  23   9  66   2  
1             0.72     0.734694  18   6  69   7  
2             0.68     0.693878  17   7  68   8  
3             0.72     0.620690  18  15  60   7  
4             0.64     0.615385  16  11  64   9  
5             0.5

In [4]:
# ===================== ENSEMBLE (SOFT VOTING) + EVALUATION =====================
# Uses the SAME evaluation style you used (predict_proba + threshold=0.5)
# Note: VotingClassifier cannot include the raw Keras model unless you wrap it.
# We'll ensemble sklearn/xgb/lgbm models first (recommended for tabular data).

import numpy as np
import pandas as pd

from sklearn.ensemble import VotingClassifier
from sklearn.metrics import (
    precision_recall_fscore_support,
    roc_auc_score,
    accuracy_score,
    confusion_matrix
)

# --------- helpers (same style as yours) ----------
def eval_model(model, X_eval, y_eval, threshold=0.5):
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_eval)[:, 1]
    else:
        y_prob = np.asarray(model.predict(X_eval, verbose=0)).ravel()

    y_pred = (y_prob >= threshold).astype(int)

    precision, recall, f1, _ = precision_recall_fscore_support(
        y_eval, y_pred, pos_label=1, average="binary", zero_division=0
    )
    auc = roc_auc_score(y_eval, y_prob)
    acc = accuracy_score(y_eval, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_eval, y_pred).ravel()

    return {
        "roc_auc": auc,
        "accuracy": acc,
        "precision_fraud(1)": precision,
        "recall_fraud(1)": recall,
        "f1_fraud(1)": f1,
        "tp": tp, "fp": fp, "tn": tn, "fn": fn
    }

# -------------------------
# 1) Build the soft-voting ensemble
#    (must be FITTED base models already: log_reg, dt, rf, gb, xgb, lgbm)
# -------------------------
ensemble = VotingClassifier(
    estimators=[
        ("lr", log_reg),
        ("rf", rf),
        ("gb", gb),
        ("xgb", xgb),
        ("lgbm", lgbm),
        # You can add ("dt", dt) if you want, but DT often hurts generalization
    ],
    voting="soft",
    weights=[2, 1, 1, 2, 1],   # feel free to keep simple equal weights: [1,1,1,1,1]
    n_jobs=-1
)

# IMPORTANT: this will refit the ensemble (it refits base estimators internally)
ensemble.fit(X_train, y_train)

# -------------------------
# 2) Evaluate on VALIDATION + TEST
# -------------------------
val_metrics = eval_model(ensemble, X_val, y_val, threshold=0.5)
test_metrics = eval_model(ensemble, X_test, y_test, threshold=0.5)

print("\n=== Soft Voting Ensemble (VAL) ===")
for k, v in val_metrics.items():
    print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")

print("\n=== Soft Voting Ensemble (TEST) ===")
for k, v in test_metrics.items():
    print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")

# -------------------------
# 3) Compare ensemble vs your best single model (Logistic Regression)
# -------------------------
lr_val = eval_model(log_reg, X_val, y_val, threshold=0.5)
lr_test = eval_model(log_reg, X_test, y_test, threshold=0.5)

summary = pd.DataFrame([
    {"model": "Logistic Regression", "split": "VAL",  **lr_val},
    {"model": "Soft Voting Ensemble","split": "VAL",  **val_metrics},
    {"model": "Logistic Regression", "split": "TEST", **lr_test},
    {"model": "Soft Voting Ensemble","split": "TEST", **test_metrics},
])

print("\n=== Ensemble vs Logistic Regression (summary) ===")
print(summary[["model","split","roc_auc","accuracy","precision_fraud(1)","recall_fraud(1)","f1_fraud(1)","tp","fp","tn","fn"]])


NameError: name 'xgb' is not defined