In [5]:
import pandas as pd
import numpy as np

class InsurancePreprocessor:
    """Turn raw policy records into the fixed feature matrix used for scoring.

    The preprocessor holds no fitted state: ``__init__`` only stores the list of
    output columns (``keep_features``) and the subset of those that are one-hot
    indicator columns (``bool_cols``).
    """

    def __init__(self):
        # Columns returned by transform(), in this exact order.
        self.keep_features = [
            'subscription_length',
            'customer_age',
            'torque_Nm',
            'torque_rpm',
            'log_region_density',
            'model_encoded',
            'vehicle_age_bin_encoded',
            'fuel_type_Diesel',
            'fuel_type_Petrol',
            'segment_B2',
            'segment_C1',
            'segment_C2',
            'segment_Utility'
        ]
        # Indicator columns that must always exist and be integer-typed.
        self.bool_cols = [
            'fuel_type_Diesel', 'fuel_type_Petrol',
            'segment_B2', 'segment_C1', 'segment_C2', 'segment_Utility'
        ]

    def extract_torque(self, df: pd.DataFrame) -> pd.DataFrame:
        """Split ``max_torque`` strings such as "91Nm@4250rpm" into two float columns.

        Adds ``torque_Nm`` and ``torque_rpm`` to ``df`` in place and returns the
        same frame. Strings that do not match yield NaN in both columns.

        Raises:
            KeyError: if ``df`` has no ``max_torque`` column.
        """
        # Each number may carry a decimal part. Without the optional fraction,
        # "82.1Nm@3400rpm" would be read as torque 82 and rpm 1.
        torque_split = df['max_torque'].str.extract(
            r'(?P<torque_Nm>\d+(?:\.\d+)?)\D+(?P<torque_rpm>\d+(?:\.\d+)?)'
        )
        df['torque_Nm'] = torque_split['torque_Nm'].astype(float)
        df['torque_rpm'] = torque_split['torque_rpm'].astype(float)
        return df

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Build the model feature frame from raw (or partially processed) records.

        The input frame is not modified. Rows with ``torque_Nm == 1`` or
        ``subscription_length == 0`` are removed; surviving rows keep their
        original index labels, so callers can realign identifiers with
        ``raw.loc[result.index]``.

        Each derived column is only computed when it is not already present.

        Returns:
            A frame containing exactly ``self.keep_features``, in that order.

        Raises:
            KeyError: if a required raw or final column is missing.
        """
        df = df.copy()

        # Step 1: Extract torque columns
        if 'torque_Nm' not in df.columns or 'torque_rpm' not in df.columns:
            df = self.extract_torque(df)

        # Step 2: Filter invalid rows.
        # Copy after filtering so later column assignments write to an
        # independent frame instead of a slice (avoids SettingWithCopyWarning).
        # NOTE(review): why torque_Nm == 1 marks a bad row is not visible here — confirm.
        valid_rows = (df['torque_Nm'] != 1) & (df['subscription_length'] != 0)
        df = df.loc[valid_rows].copy()

        # Step 3: Log transform of region_density
        if 'log_region_density' not in df.columns:
            df['log_region_density'] = np.log1p(df['region_density'])

        # Step 4: Vehicle age binning.
        # pd.cut already returns ordered categories in label order, so the codes
        # are 0..3; ages outside (-1, 100] or missing get code -1.
        if 'vehicle_age_bin_encoded' not in df.columns:
            age_bins = pd.cut(
                df['vehicle_age'],
                bins=[-1, 1, 3, 10, 100],
                labels=['<1yr', '1-3yr', '3-10yr', '10yr+']
            )
            df['vehicle_age_bin_encoded'] = age_bins.cat.codes

        # Step 5: Model frequency encoding.
        # NOTE(review): counts come from the rows of *this* batch, so the same
        # model can encode differently between batches — confirm this matches
        # how the downstream model was trained.
        if 'model_encoded' not in df.columns:
            model_freq = df['model'].value_counts()
            df['model_encoded'] = df['model'].map(model_freq)

        # Step 6: Drop columns that are not model inputs (this includes policy_id;
        # callers that need it must keep their own copy).
        drop_cols = [
            'policy_id', 'model', 'region_code', 'max_torque',
            'region_density', 'vehicle_age', 'vehicle_age_bin',
            'age_bin'  # include if exists
        ]
        df = df.drop(columns=drop_cols, errors='ignore')

        # Step 7: One-hot encode.
        # No drop_first: it drops whichever category sorts first in the current
        # batch, so a batch containing only e.g. "Diesel" would lose the
        # fuel_type_Diesel column. Surplus indicator columns are discarded by
        # the final selection instead. Skip sources that are already encoded.
        one_hot_sources = [c for c in ('fuel_type', 'segment') if c in df.columns]
        if one_hot_sources:
            df = pd.get_dummies(df, columns=one_hot_sources)

        # Step 8: Ensure all expected one-hot columns exist
        for col in self.bool_cols:
            if col not in df.columns:
                df[col] = 0
            df[col] = df[col].astype(int)

        # Step 9: Final selection
        return df[self.keep_features]


In [2]:
import joblib

# Instantiate preprocessor (its __init__ only stores the feature-name lists)
preprocessor = InsurancePreprocessor()

# Save as a pkl file, relative to the current working directory.
# NOTE(review): InsurancePreprocessor is defined inside this notebook, so the
# pickle presumably references it by module path; loading it elsewhere would
# need the same class importable under that path — confirm before deploying.
joblib.dump(preprocessor, "insurance_preprocessor.pkl")


['insurance_preprocessor.pkl']

In [6]:
import pickle
from pathlib import Path

# Build paths with pathlib so the cell also runs on non-Windows hosts
# (hard-coded backslash separators only resolve on Windows).
# "Assests" is kept as spelled because it is the directory name on disk.
ASSETS_DIR = Path("..") / "Assests"

unseen_df = pd.read_csv(ASSETS_DIR / "data" / "unseenIncoming.csv")

preprocessor = InsurancePreprocessor()
X_unseen = preprocessor.transform(unseen_df)

# transform() removes some rows but keeps the original index labels, so select
# the ids of the surviving rows only. Taking the full policy_id column would
# leave it longer than the prediction arrays built from X_unseen.
policy_ids = unseen_df.loc[X_unseen.index, "policy_id"]

# NOTE(review): pickle.load can execute arbitrary code; only load scaler files
# that come from a trusted source.
with open(ASSETS_DIR / "model" / "standard_scaler.pkl", "rb") as f:
    scaler = pickle.load(f)

X_unseen_scaled = scaler.transform(X_unseen)

In [1]:
import mlflow


In [None]:
import mlflow
import joblib

# One-off export: copy the registered model out of MLflow into a local pickle.
# This cell has no execution count, i.e. it was not run in the saved session.

# Step 1: Set tracking URI
mlflow.set_tracking_uri("http://127.0.0.1:5000")  # Update if hosted remotely

# Step 2: Load the model from Model Registry (registered name "XGboost_modelReady", version 1)
model = mlflow.sklearn.load_model("models:/XGboost_modelReady/1")

# Step 3: Save as .pkl, relative to the current working directory
joblib.dump(model, "xgboost_model.pkl")
print("✅ Model saved to xgboost_model.pkl")


In [7]:
import mlflow
# Point the MLflow client at the local tracking server and fetch version 1 of
# the registered model.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
model = mlflow.sklearn.load_model("models:/XGboost_modelReady/1")

# Score the scaled features: hard labels, plus the second predict_proba column
# (presumably the probability of the positive/claim class — confirm).
predictions = model.predict(X_unseen_scaled)
probabilities = model.predict_proba(X_unseen_scaled)[:, 1]

# Assemble one row per scored policy: id, predicted label, predicted probability.
results = pd.DataFrame({"policy_id": policy_ids}).assign(
    prediction=predictions,
    predicted_proba=probabilities,
)

In [8]:
# Show the combined id / prediction / probability table as the cell output.
results

Unnamed: 0,policy_id,prediction,predicted_proba
0,POL030455,0,0.339529
1,POL008274,0,0.247029
2,POL008619,0,0.475522
3,POL052957,0,0.242513
4,POL057486,0,0.366139


In [9]:
def assign_risk_segment(proba, high_threshold=0.50, medium_threshold=0.35):
    """Map a predicted probability to a risk bucket label.

    Args:
        proba: Predicted probability for one policy.
        high_threshold: Inclusive lower bound of the "High Risk" bucket.
        medium_threshold: Inclusive lower bound of the "Medium Risk" bucket.

    Returns:
        "High Risk" when ``proba >= high_threshold``, "Medium Risk" when
        ``proba >= medium_threshold``, otherwise "Low Risk". A NaN probability
        fails both comparisons and therefore also yields "Low Risk".

    Raises:
        ValueError: if ``medium_threshold`` is greater than ``high_threshold``,
            which would make the medium bucket unreachable.
    """
    if medium_threshold > high_threshold:
        raise ValueError("medium_threshold must not exceed high_threshold")
    if proba >= high_threshold:
        return "High Risk"
    if proba >= medium_threshold:
        return "Medium Risk"
    return "Low Risk"

# Label every row of `results` with its risk bucket (adds a column in place).
results["risk_segment"] = results["predicted_proba"].map(assign_risk_segment)

# Show only the identifier, the score and the bucket.
results.loc[:, ["policy_id", "predicted_proba", "risk_segment"]]


Unnamed: 0,policy_id,predicted_proba,risk_segment
0,POL030455,0.339529,Low Risk
1,POL008274,0.247029,Low Risk
2,POL008619,0.475522,Medium Risk
3,POL052957,0.242513,Low Risk
4,POL057486,0.366139,Medium Risk




### 📊 **What Data We Used — In Business Language**

> To train this model, we used a dataset that reflects real-world customer profiles and vehicle information — the same type of data we already collect during policy onboarding and servicing. Specifically, we used:

#### 👤 Customer Information:

* **Age of the customer**
* **How long they’ve been with us** (subscription length)
* **Driving behavior proxies** like engine torque (from vehicle model)

#### 🚗 Vehicle Attributes:

* **Vehicle age** (grouped as: up to 1 year, 1–3 years, 3–10 years, over 10 years)
* **Model popularity/frequency** (how common the vehicle model is among our customers)
* **Fuel type** and **vehicle segment** (e.g., city car, SUV, etc.)

#### 🌍 Location-Based and Technical Risk Factors:

* **Region density** (log-transformed; a proxy for urban vs rural)
* **Engine torque details** — derived from technical specs



### 💼 Why This Data Matters:

> “These features combine both **customer behavior** and **vehicle risk exposure**. For example:

* Younger customers with high-torque vehicles in dense cities may be **more claim-prone**.
* A long-time customer with a mid-tier vehicle in a rural area might be **low risk**.

We used these natural signals to teach our AI model which patterns **often lead to claims** and which don't.”




Yes, **that’s exactly how many modern insurance companies operate** — especially large players like **Admiral, AXA, Allstate, Aviva, Zurich**. You’ve nailed the real-world use case. Here's how it typically works:

---

### ✅ Industry Practice: Risk-Based Claims Handling

#### 🔵 1. **Risk Profiling → Dual Path Strategy**

After onboarding or during claim submission:

* The **ML model assigns a risk score** based on customer attributes (like you’ve done)
* This score then helps route the claim through different workflows:

| Risk Level    | If Claim is Filed | Action Taken                                                                      |
| ------------- | ----------------- | --------------------------------------------------------------------------------- |
| **High Risk** | Claim expected    | **In-depth checks**: cross-verify with telematics, repair history, 3rd-party data |
| **Low Risk**  | Claim unexpected  | **Light fast-track** if documents are clean; **flag for audit** if anomalies      |

---

#### 🔶 2. **Data Used in Real-Time Checks**

* Telematics (black box, mobile app data): to verify **speed, braking, accident force**
* External sources: weather, traffic logs, CCTV (for serious cases)
* Historical claim behavior: for similar models/regions

---

#### 🔍 3. **Why It Matters**

* Reduces **fraud losses** (billions yearly across the UK)
* Improves **claims handling efficiency**
* Enables **dynamic pricing** in future renewals
* Strengthens **regulatory compliance** (e.g. FCA fairness rules)

---

### 🧠 Strategic Value You’re Building

You’ve created the **foundation for intelligent triage**:

* You're **predicting claim risk from profile data**
* This can now be **connected to downstream actions**: documentation requirements, telematics verification, or fast-lane approvals


---

### 🔍 Scenario: Low-Risk Customer Files a Claim (Unexpected)

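Your ML model flags the customer as **low risk**. A typical low-risk profile looks like the list below (driving record and claim history are illustrative — they are not inputs to the current model, which uses only the customer, vehicle and region features listed earlier):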

* Clean driving record
* Good vehicle type
* Long subscription
* Low torque engine (less aggressive)
* No previous claims

#### 🎯 Now a claim is filed. What happens?

---

### ✅ What to Do: **Fast-Track but Monitor**

#### 1. **Auto-Fast-Track Logic**

* The system checks:

  * ✅ Documents submitted? (photos, receipts, police report)
  * ✅ Claim value < £1,000? (threshold set by business)
  * ✅ No red flags in metadata (e.g., duplicate IP address, odd timestamps)

If all clear → **claim is paid quickly (in hours to 1 day)**.

#### 2. **Auto-Anomaly Checks**

Even for low-risk customers, if anything looks *odd*, system flags for manual audit:

* 📷 **Image forensics**: damage photo EXIF timestamps don’t match report time
* 📍 **Location mismatch**: accident location is 300 miles from customer address
* 📞 **Voice/text inconsistency**: if chatbot or agent logs are used, NLP may detect scripted responses

---

### 🧠 Why This Matters for Business

| Feature                        | Value to Insurer                                 |
| ------------------------------ | ------------------------------------------------ |
| Fast-track low-risk claims     | Improves **customer satisfaction** and retention |
| Audit anomaly even in low risk | Catches **rare but damaging fraud** cases        |
| Smart prioritization           | Saves **manual hours** and avoids bottlenecks    |

---

### 🔗 Real-World Tools That Enable This

| Task                   | Tools Used                                                      |
| ---------------------- | --------------------------------------------------------------- |
| Document checks        | OCR + metadata extraction (AWS Textract, Azure Form Recognizer) |
| Image fraud detection  | Forensics tools or ML-based image tampering models              |
| Risk model integration | ML model served via API in claims portal                        |
| Workflow routing       | Business rules engine (like Pega, Camunda)                      |
