# Insurance Fraud Detection Model
This notebook demonstrates the training and evaluation of a machine learning model to detect fraudulent insurance claims.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
import shap
import joblib


In [None]:
# Load the raw claims table from disk into `df`, which the later cells read.
claims_csv = "data/claims.csv"
df = pd.read_csv(claims_csv)
# Trailing expression: the notebook renders the first five rows as a preview.
df.head(n=5)


In [None]:
# Model inputs: two numeric columns plus the free-text incident description.
X = df[['claim_amount', 'claimant_age', 'incident_description']]
# Target column read from the same table.
y = df['is_fraud']

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in the train and test folds; without it a skewed
# `is_fraud` distribution (fraud labels are typically a small minority — confirm
# against the data) can leave the test fold with a different fraud rate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
# Preprocessing: standardize the two numeric columns and convert the free-text
# description into at most 100 TF-IDF features.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), ['claim_amount', 'claimant_age']),
    # The column is selected with a bare string (not a list) so the vectorizer
    # receives a 1-D sequence of documents rather than a 2-D frame.
    ('txt', TfidfVectorizer(max_features=100), 'incident_description')
])

# Chain preprocessing and the classifier so both are fitted (and later saved)
# together as one object.
pipeline = Pipeline([
    ('pre', preprocessor),
    # `use_label_encoder` is intentionally not passed: it is a deprecated (and
    # in recent XGBoost releases removed) argument that only triggers an
    # "unused parameter" warning. `random_state` mirrors the seed used for the
    # train/test split so reruns are reproducible.
    ('clf', XGBClassifier(eval_metric='logloss', random_state=42))
])

# Fit on the training fold only; the trailing expression lets the notebook
# display the fitted pipeline.
pipeline.fit(X_train, y_train)


In [None]:
# Score the held-out rows: hard class predictions plus class probabilities.
y_pred = pipeline.predict(X_test)
class_probabilities = pipeline.predict_proba(X_test)
# Column 1 is the probability of the second class in the classifier's class
# order (presumably is_fraud == 1 — confirm against the label encoding).
y_proba = class_probabilities[:, 1]

# Per-class precision / recall / F1 computed from the hard predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)

# ROC AUC is computed from the probabilities, not the hard predictions.
auc_value = roc_auc_score(y_test, y_proba)
print("AUC Score:", auc_value)


In [None]:
# Explain the fitted classifier on the exact features it was scored on: the
# raw test rows pushed through the fitted preprocessing step.
fitted_preprocessor = pipeline.named_steps['pre']
X_test_transformed = fitted_preprocessor.transform(X_test)

# The transformed matrix carries no column labels, so without explicit names
# the plot can only show positional placeholders. Recover the real names
# (e.g. "num__claim_amount", "txt__<term>") from the fitted transformer.
feature_names = list(fitted_preprocessor.get_feature_names_out())

explainer = shap.Explainer(
    pipeline.named_steps['clf'],
    feature_names=feature_names,
)
# NOTE(review): the matrix is passed in the same (possibly sparse) form the
# classifier was trained on. It is deliberately not densified here, because
# XGBoost treats absent sparse entries as missing but dense zeros as values,
# which would make the explained predictions differ from pipeline.predict.
shap_values = explainer(X_test_transformed)

# Waterfall plot of the per-feature contributions for the first test row.
shap.plots.waterfall(shap_values[0])


In [None]:
# Persist the fitted pipeline (preprocessing + classifier) as a single artifact.
from pathlib import Path

model_path = "app/model_pipeline.pkl"
# joblib.dump does not create missing directories; make sure the target
# folder exists so a fresh checkout does not fail with FileNotFoundError.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, model_path)
print(f"Model saved to {model_path}")
