In [None]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Read the claims table and convert the literal '?' placeholder into NaN.
df = pd.read_csv('../data/insurance_claims.csv').replace('?', np.nan)

# For these categorical fields a missing value is kept as its own explicit
# 'missing' category instead of being left as NaN.
placeholder_cols = ['collision_type', 'property_damage', 'police_report_available', 'authorities_contacted']
df[placeholder_cols] = df[placeholder_cols].fillna('missing')

# Columns that are excluded from the feature set.
unused_columns = [
    'policy_number', 'policy_bind_date', 'policy_csl', 'insured_zip',
    'incident_date', 'incident_location', 'policy_state', 'incident_city',
    'insured_relationship', 'auto_make', 'auto_model', 'auto_year',
]
df = df.drop(columns=unused_columns)

# Encode the target as integers (LabelEncoder assigns codes in sorted label
# order), then remove it from the feature frame.
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(df['fraud_reported'])
df = df.drop(columns='fraud_reported')

# One-hot encode the remaining categoricals (first level dropped) and
# standardise every resulting column.
# NOTE(review): this scaler is fit on every row, i.e. before any train/test
# split, so X_scaled carries statistics from rows that may later be held out.
X = pd.get_dummies(df, drop_first=True)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
# Hold out 25% of the rows for evaluation. The split is taken on the unscaled
# design matrix X so that every fitted preprocessing step below only ever sees
# training rows. Row assignment depends only on y, random_state and the sample
# count, so it matches a split of the pre-scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Fit the scaler on the training rows only and reuse it for the test rows;
# fitting it on all rows would leak test-set means/variances into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# SMOTE-balanced copy of the training set. Kept because code outside this cell
# may reference X_train_bal / y_train_bal.
sm = SMOTE(random_state=42)
X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)

# Oversampling happens inside the pipeline so that, during cross-validation,
# SMOTE is fit on each training fold only. Running the grid search on data that
# was oversampled up front puts synthetic neighbours of training points into
# the validation folds and inflates the F1 used for model selection.
rf_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
])
param_grid = {
    'rf__n_estimators': [100],
    'rf__max_depth': [5, 8],
    'rf__min_samples_split': [2, 5],
}
grid = GridSearchCV(rf_pipeline, param_grid, cv=3, scoring='f1', n_jobs=-1)
grid.fit(X_train, y_train)

# After the default refit, the pipeline's forest has been trained on the
# SMOTE-balanced full training set. Expose the forest itself so best_rf is
# still a RandomForestClassifier.
best_rf = grid.best_estimator_.named_steps['rf']
y_probs = best_rf.predict_proba(X_test)[:, 1]


In [None]:
# Fit the anomaly detector on the real (un-resampled) training rows. In the
# SMOTE-balanced set the oversampled class makes up half of the data, so an
# isolation forest fit on it would no longer treat that class as unusual.
iso = IsolationForest(contamination=0.1, random_state=42)
iso.fit(X_train)

# decision_function is lower for more abnormal points; negate it so that a
# larger value means "more anomalous".
anomaly_scores = -iso.decision_function(X_test)

# Bring the anomaly score onto [0, 1] before averaging it with a probability.
# The raw decision_function offset lives on a different, narrower scale, so an
# unscaled average would not weight the two signals equally. The range is taken
# from the training rows so the scaling does not depend on the test set.
train_anomaly = -iso.decision_function(X_train)
anomaly_lo = train_anomaly.min()
anomaly_span = train_anomaly.max() - anomaly_lo
if anomaly_span > 0:
    anomaly_unit = np.clip((anomaly_scores - anomaly_lo) / anomaly_span, 0.0, 1.0)
else:
    # Degenerate case: every training row got the same score.
    anomaly_unit = np.zeros_like(anomaly_scores)

# 'anomaly_score' keeps the raw (negated) decision_function value;
# 'combined_risk_score' is the mean of the fraud probability and the
# [0, 1]-scaled anomaly score.
combined_risk = (y_probs + anomaly_unit) / 2
risk_df = pd.DataFrame({
    'fraud_probability': y_probs,
    'anomaly_score': anomaly_scores,
    'combined_risk_score': combined_risk,
    'actual_label': y_test
})

# Flag the top 10% of test rows by combined risk (rounded down).
top_n = int(0.10 * len(risk_df))
risk_df_sorted = risk_df.sort_values('combined_risk_score', ascending=False)
risk_df_sorted['flagged'] = 0
risk_df_sorted.iloc[:top_n, risk_df_sorted.columns.get_loc('flagged')] = 1


In [None]:
# Persist the ranked per-row scores; the DataFrame index is not written.
risk_df_sorted.to_csv('../data/risk_scores.csv', index=False)

# Store the ROC AUC of the classifier probabilities on the test labels as text.
with open('../data/auc.txt', 'w') as auc_file:
    auc_value = roc_auc_score(y_test, y_probs)
    auc_file.write(str(auc_value))

print("✅ Model trained and scores saved.")