### Insurance Dataset: Feature Engineering Pipeline

This notebook builds a modeling dataset from the raw insurance file.
 

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

#### Load raw dataset

Reads the cleaned insurance dataset and parses date columns required for age and experience features.

In [None]:
# Location of the input CSV and the columns to parse as datetimes
raw_csv_path = "../../data/output/insurance_new.csv"
date_fields = ["Date_birth", "Date_driving_licence"]

# Read the file; the two date columns are needed later for the
# age and driving-experience features
df = pd.read_csv(raw_csv_path, parse_dates=date_fields)

# Report the dimensions of the loaded frame
print(f"Dataset loaded. Shape: {df.shape[0]:,} rows, {df.shape[1]:,} columns")

#### Remove duplicate rows

Creates a working copy and ensures duplicate records do not distort feature creation.

In [None]:
# Build the working frame: copy the loaded data so `df` stays untouched,
# drop fully duplicated rows, and renumber the surviving rows from 0
df_cp = df.copy().drop_duplicates(ignore_index=True)

# Report the shape after deduplication
print(f"After deduplication. Shape: {df_cp.shape}")

#### Time and driver risk features

Creates age, driving experience, and early-career risk indicators.
These are core underwriting signals.

In [None]:
# Small offset used by later cells as a denominator guard against
# division by zero
epsilon = 1e-10

# Fixed year against which age and experience are measured
# NOTE(review): presumably the portfolio's observation year -- confirm
reference_year = 2018

# Extract the calendar years once; month and day are ignored, so all
# durations below are year-level differences
birth_year = df_cp["Date_birth"].dt.year
licence_year = df_cp["Date_driving_licence"].dt.year

# Age, years holding a licence, and age when the licence was obtained
df_cp["PolicyHolder_Age"] = reference_year - birth_year
df_cp["Driving_experience_years"] = reference_year - licence_year
df_cp["Age_at_license"] = licence_year - birth_year

# Young driver flag: policy holder under 25.
# Rows with a missing age compare as False and are labelled "No".
df_cp["Young_driver_flag"] = (
    df_cp["PolicyHolder_Age"].lt(25).map({False: "No", True: "Yes"})
)

# Inexperienced driver flag: fewer than 3 years of driving experience
df_cp["Inexperienced_driver_flag"] = (
    df_cp["Driving_experience_years"].lt(3).map({False: "No", True: "Yes"})
)

# Risk proxy: policy has a second driver AND the policy holder is young
second_and_young = (
    df_cp["Second_driver"].eq("Yes") & df_cp["Young_driver_flag"].eq("Yes")
)
df_cp["Second_driver_risk_proxy"] = np.where(
    second_and_young, "High Risk", "Normal Risk"
)

print("Driver features created")

#### Financial pressure indicators

Flags high-value vehicles, defined as those whose value is above the 75th percentile of the dataset.

In [None]:
# High value vehicle flag: value strictly above the dataset's 75th percentile
value_cutoff = df_cp["Value_vehicle"].quantile(0.75)
df_cp["High_value_vehicle_flag"] = (
    df_cp["Value_vehicle"].gt(value_cutoff).map({False: "No", True: "Yes"})
)

print("Financial features created")

#### Vehicle performance and size features

Creates vehicle age plus power, size, and performance ratios and flags that may correlate with claim severity.

In [None]:
# Source columns reused several times below
veh_power = df_cp["Power"]
veh_weight = df_cp["Weight"]
veh_length = df_cp["Length"]

# Vehicle age in whole years, measured against the fixed year 2018
df_cp["Vehicle_age"] = 2018 - df_cp["Year_matriculation"]

# Ratios; epsilon keeps the division defined when a denominator is zero
# (such rows end up with a very large ratio rather than inf/NaN)
df_cp["Power_to_weight_ratio"] = veh_power.div(veh_weight + epsilon)
df_cp["Engine_intensity"] = veh_power.div(df_cp["Cylinder_capacity"] + epsilon)

# Large vehicle flag: above the 75th percentile by weight OR by length
is_heavy = veh_weight.gt(veh_weight.quantile(0.75))
is_long = veh_length.gt(veh_length.quantile(0.75))
df_cp["Large_vehicle_flag"] = (is_heavy | is_long).map({False: "No", True: "Yes"})

# Sporty vehicle flag: power-to-weight ratio above its 75th percentile
ptw_ratio = df_cp["Power_to_weight_ratio"]
df_cp["Sporty_vehicle_flag"] = (
    ptw_ratio.gt(ptw_ratio.quantile(0.75)).map({False: "No", True: "Yes"})
)

print("Vehicle performance features created")

#### Vehicle value and depreciation features

Captures value decay, new vs old vehicles, and log-scaled value.

In [None]:
# Inputs for the value / depreciation features
veh_value = df_cp["Value_vehicle"]
veh_age = df_cp["Vehicle_age"]

# log(1 + value) compresses the right tail of the value distribution
df_cp["Log_vehicle_value"] = np.log1p(veh_value)

# Value spread over the vehicle's age; the +1 keeps the denominator
# non-zero for vehicles with age 0
df_cp["Value_per_year"] = veh_value.div(veh_age + 1)

# New vehicle flag: age of 1 year or less
df_cp["New_vehicle_flag"] = veh_age.le(1).map({False: "No", True: "Yes"})

# Old vehicle flag: age strictly greater than 10 years
df_cp["Old_vehicle_flag"] = veh_age.gt(10).map({False: "No", True: "Yes"})

print("Vehicle value features created")

#### Risk proxy features

Creates engine size buckets, a high-power flag, a fuel risk class, and a vehicle size proxy.

In [None]:
# Create risk buckets based on key features:
# engine size class, high power flag, fuel type risk bucket, vehicle size proxy

# Engine size class based on cylinder capacity.
# pd.cut bins are right-closed by default, i.e. (0, 1000], (1000, 1500], ...,
# which leaves a capacity of exactly 0 outside every bin and turns it into NaN.
# include_lowest=True closes the first bin on the left so those rows are
# labelled "Very Small" instead of silently becoming missing.
df_cp["Engine_size_class"] = pd.cut(
    df_cp["Cylinder_capacity"],
    bins=[0, 1000, 1500, 2000, 3000, np.inf],
    labels=["Very Small", "Small", "Medium", "Large", "Very Large"],
    include_lowest=True,
)

# High power flag: power at or above the dataset's 90th percentile
df_cp["High_power_flag"] = (
    df_cp["Power"] >= df_cp["Power"].quantile(0.9)
).astype(int).map({0: "No", 1: "Yes"})

# Fuel type risk bucket (diesel treated as higher risk than petrol).
# Any Type_fuel value other than "diesel" or "petrol" maps to NaN.
# NOTE(review): confirm those are the only two values present in the data.
df_cp["Fuel_risk_bucket"] = df_cp["Type_fuel"].map(
    {"diesel": "Higher Risk", "petrol": "Normal Risk"}
)

# Vehicle size proxy: product of length and weight
df_cp["Vehicle_size_proxy"] = df_cp["Length"] * df_cp["Weight"]

print("Risk bucket features created")

#### Interaction features

Combines driver risk flags with vehicle, area, payment, and second-driver attributes.
Interactions like these can improve model performance.

In [None]:
# Create interaction features.
# Each new column is "High Risk" for rows where BOTH conditions hold and
# "Normal Risk" otherwise. Every entry is
#   (new column, (first column, required value), (second column, required value))
# and the columns are created in the order listed.
interaction_specs = [
    # Young driver with high power
    ("Young_driver_High_power",
     ("Young_driver_flag", "Yes"), ("High_power_flag", "Yes")),
    # Inexperienced driver with sporty vehicle
    ("Inexperienced_driver_Sporty_vehicle",
     ("Inexperienced_driver_flag", "Yes"), ("Sporty_vehicle_flag", "Yes")),
    # Inexperienced driver with large vehicle
    ("Inexperienced_driver_Large_vehicle",
     ("Inexperienced_driver_flag", "Yes"), ("Large_vehicle_flag", "Yes")),
    # Urban area with high value vehicle
    ("Urban_area_High_value",
     ("Area", "urban"), ("High_value_vehicle_flag", "Yes")),
    # Half-yearly payment with young driver
    ("half_yearly_payment_Young_driver",
     ("Payment", "half-yearly"), ("Young_driver_flag", "Yes")),
    # Second driver with young driver
    # NOTE(review): same two conditions as Second_driver_risk_proxy, so the
    # two columns carry identical values -- confirm both are wanted
    ("Second_driver_Young_driver",
     ("Second_driver", "Yes"), ("Young_driver_flag", "Yes")),
]

for new_col, (first_col, first_val), (second_col, second_val) in interaction_specs:
    both_hold = df_cp[first_col].eq(first_val) & df_cp[second_col].eq(second_val)
    df_cp[new_col] = np.where(both_hold, "High Risk", "Normal Risk")

print("Interaction features created")

#### Final cleanup

Removes the raw date columns, casts `Year_matriculation` to string, and separates the features from the target.
The target is the log-transformed premium, `log1p(Premium)`.

In [None]:
# Remove the raw date columns: every column whose name starts with "Date_"
date_columns = list(filter(lambda name: name.startswith("Date_"), df_cp.columns))
df_cp = df_cp.drop(columns=date_columns)

# Store the registration year as text rather than as a number
df_cp["Year_matriculation"] = df_cp["Year_matriculation"].astype("str")

# Target: log(1 + Premium), kept as a single-column DataFrame
target = np.log1p(df_cp[["Premium"]])

# Features: every remaining column except the premium itself
features = df_cp.drop(columns="Premium")

print("Dataset ready")
print(f"Features: {features.shape[0]:,} rows, {features.shape[1]:,} columns")
print(f"Target: {target.shape[0]:,} rows, {target.shape[1]:,} columns")

#### Save the processed data to output folder

In [None]:
# Write the feature matrix and the target to their own CSV files,
# without the row index
for frame, csv_path in (
    (features, "../../data/output/insurance_features.csv"),
    (target, "../../data/output/insurance_target.csv"),
):
    frame.to_csv(csv_path, index=False)