In [1]:

import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler


In [2]:
# --- File locations ---------------------------------------------------
# Input: the claims table read by this notebook.
DATA_FILE = "insurance_claims_synthetic.csv"
# Outputs: the scored table and the fitted objects written at the end.
PROCESSED_OUT = "processed_output.csv"
SCALER_OUT = "scaler.pkl"
MODEL_OUT = "model.pkl"

In [3]:
# Load the claims table from DATA_FILE into a DataFrame.
df = pd.read_csv(DATA_FILE)


In [4]:
# Preview the first rows to sanity-check column names and values.
df.head()

Unnamed: 0,policy_id,age,vehicle_price,annual_premium,claim_amount,accident_severity,past_claims,policy_tenure_months,fraud_reported
0,1,56,1437715,57877,74964,1,4,104,0
1,2,69,1429024,21163,168509,0,4,80,0
2,3,46,1154480,58728,119006,1,1,31,0
3,4,32,435659,10782,50583,0,4,53,0
4,5,60,589678,27641,38392,0,2,106,0


In [5]:
# (rows, columns) of the loaded table.
df.shape

(1200, 9)

In [6]:
# Per-column dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   policy_id             1200 non-null   int64
 1   age                   1200 non-null   int64
 2   vehicle_price         1200 non-null   int64
 3   annual_premium        1200 non-null   int64
 4   claim_amount          1200 non-null   int64
 5   accident_severity     1200 non-null   int64
 6   past_claims           1200 non-null   int64
 7   policy_tenure_months  1200 non-null   int64
 8   fraud_reported        1200 non-null   int64
dtypes: int64(9)
memory usage: 84.5 KB


In [7]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,policy_id,age,vehicle_price,annual_premium,claim_amount,accident_severity,past_claims,policy_tenure_months,fraud_reported
count,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0
mean,600.5,46.3175,877794.6,31868.345,99773.650833,0.679167,2.465833,60.7225,0.07
std,346.554469,16.451157,371984.0,15682.924818,57175.134844,0.866136,1.718903,33.626122,0.255253
min,1.0,18.0,200126.0,5097.0,138.0,0.0,0.0,1.0,0.0
25%,300.75,33.0,548399.0,18069.75,51318.25,0.0,1.0,33.0,0.0
50%,600.5,46.0,882457.5,31719.5,99753.0,0.0,2.0,61.0,0.0
75%,900.25,61.0,1211402.0,45225.75,149713.25,1.0,4.0,89.0,0.0
max,1200.0,74.0,1499445.0,59929.0,199971.0,3.0,5.0,119.0,1.0


In [8]:
# Count of missing values in each column.
df.isnull().sum()

policy_id               0
age                     0
vehicle_price           0
annual_premium          0
claim_amount            0
accident_severity       0
past_claims             0
policy_tenure_months    0
fraud_reported          0
dtype: int64

In [9]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

0

In [10]:
# Ratio features relative to vehicle value. The +1 in the denominator
# guards against division by zero for a zero-priced vehicle.
_vehicle_denominator = df['vehicle_price'] + 1
df['claim_to_vehicle_ratio'] = df['claim_amount'] / _vehicle_denominator
df['premium_to_vehicle_ratio'] = df['annual_premium'] / _vehicle_denominator

In [11]:
# 1 when the policy has two or more prior claims, otherwise 0.
df['past_claims_flag'] = df['past_claims'].ge(2).astype(int)

# Severity rescaled by (observed maximum + 1); for non-negative severities
# the result stays strictly below 1.
_severity = df['accident_severity']
df['severity_norm'] = _severity / (_severity.max() + 1)

In [12]:
# Model inputs. The order here is the column order of the matrix that is
# scaled and passed to the model, so keep it stable.
feature_cols = [
    # raw policy / claim attributes
    'age', 'vehicle_price', 'annual_premium', 'claim_amount',
    # engineered ratios
    'claim_to_vehicle_ratio', 'premium_to_vehicle_ratio',
    # claim history
    'past_claims', 'past_claims_flag',
    # tenure and rescaled severity
    'policy_tenure_months', 'severity_norm',
]

In [13]:
# Independent copy of the feature columns, so later edits to df do not
# leak into the model matrix (and vice versa).
X = df.loc[:, feature_cols].copy()

In [14]:
# Standardise every feature: learn the statistics from X, then apply them.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [15]:
# Isolation Forest hyper-parameters, kept together so they are easy to tune.
iso_params = dict(
    n_estimators=300,
    contamination=0.03,   # tunable: assumed share of outliers in the data
    max_samples='auto',
    random_state=101,     # fixed seed for reproducible trees
    verbose=0,
)
iso = IsolationForest(**iso_params)
iso.fit(X_scaled)

In [16]:
# decision_function is lower for more abnormal samples, so negate it to get
# a score where larger means more anomalous.
raw_score = iso.decision_function(X_scaled)
anom_score = -raw_score

# Min-max rescale to roughly [0, 1]; the small epsilon keeps the division
# defined when every score is identical.
anom_min = anom_score.min()
anom_max = anom_score.max()
_score_span = anom_max - anom_min + 1e-9
anom_score_norm = (anom_score - anom_min) / _score_span

In [17]:
# Hand-written business rules; each rule that fires adds a fixed weight.
_high_claim_ratio = X['claim_to_vehicle_ratio'] > 0.2
_repeat_claimant = X['past_claims'] >= 3
_claim_exceeds_premium = X['claim_amount'] > X['annual_premium'] * 3

rule_boost = (
    _high_claim_ratio.astype(int) * 0.25
    + _repeat_claimant.astype(int) * 0.20
    + _claim_exceeds_premium.astype(int) * 0.15
)

In [18]:
# Blend: 75% model-based anomaly score, 25% rule-based boost (boost limited
# to [0, 1] first), then clamp the blend itself to [0, 1].
_model_part = 0.75 * anom_score_norm
_rule_part = 0.25 * rule_boost.clip(0, 1)
combined_score = np.clip(_model_part + _rule_part, 0, 1)

In [19]:
# Flag the top `pct` share of scores, but never with a cut-off below 0.65.
pct = 0.015
_top_quantile = np.quantile(combined_score, 1 - pct)
thresh = max(_top_quantile, 0.65)

_is_suspicious = combined_score >= thresh
flags = _is_suspicious.astype(int)
labels = np.where(_is_suspicious, "Suspicious", "Normal")

In [20]:
# Attach the results to the claims table (column order is preserved).
for _column, _values in (
    ('anomaly_score', combined_score.round(4)),  # score rounded to 4 decimals
    ('anomaly_flag', flags),                     # 1 = suspicious, 0 = normal
    ('anomaly_label', labels),                   # "Suspicious" / "Normal"
):
    df[_column] = _values

In [21]:
# Sanity check against the known label. This runs only when the label column
# exists and contains more than one distinct value.
if 'fraud_reported' in df.columns and df['fraud_reported'].nunique() > 1:
    _actual = df['fraud_reported'].astype(int)
    _predicted = df['anomaly_flag'].astype(int)

    _is_fraud, _not_fraud = _actual.eq(1), _actual.eq(0)
    _is_flagged, _not_flagged = _predicted.eq(1), _predicted.eq(0)

    tp = int((_is_fraud & _is_flagged).sum())    # fraud, flagged
    fp = int((_not_fraud & _is_flagged).sum())   # not fraud, flagged
    fn = int((_is_fraud & _not_flagged).sum())   # fraud, missed
    print("Quick eval (using fraud_reported): TP={}, FP={}, FN={}".format(tp, fp, fn))

Quick eval (using fraud_reported): TP=2, FP=16, FN=82


In [22]:
# Persist the fitted model and scaler, then the scored claims table.
# Context managers make sure each file handle is flushed and closed even if
# pickling raises; a bare open() inside the call leaves that to the garbage
# collector.
# NOTE: pickle files can execute arbitrary code when loaded, so only load
# these artefacts from a trusted location.
with open(MODEL_OUT, "wb") as model_file:
    pickle.dump(iso, model_file)
with open(SCALER_OUT, "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)
df.to_csv(PROCESSED_OUT, index=False)

In [23]:
# Final summary: the files written and the number of flagged claims.
_saved_paths = (MODEL_OUT, SCALER_OUT, PROCESSED_OUT)
print("Saved:", *_saved_paths)

_n_suspicious = int(df['anomaly_flag'].sum())
print("Suspicious count:", _n_suspicious)

Saved: model.pkl scaler.pkl processed_output.csv
Suspicious count: 18
