# Insurance Risk Analytics: Regression Modeling

This notebook covers the full workflow for predicting insurance claims using regression models.

In [1]:
# 1. Import libraries and load data
import pandas as pd
import numpy as np

data = pd.read_csv('../data/cleaned_insurance_data.csv')
print("Data shape:", data.shape)
data.head()

  data = pd.read_csv('../data/cleaned_insurance_data.csv')


Data shape: (611066, 53)


Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims,LossRatio
0,145249,12827,2015-03-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0,0.0
1,145249,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0,0.0
2,145255,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0,0.0
3,145247,12827,2015-01-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Third Party,Third Party,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,3.256435,0.0,0.0
4,145247,12827,2015-04-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Third Party,Third Party,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,50.474737,0.0,0.0


## 2. Data Exploration & Cleaning

Check for missing values and handle them appropriately.

In [2]:
# Check missing values
print("Missing values per column:")
print(data.isnull().sum())

# Drop columns with too many missing values (if any)
threshold = 0.5  # drop columns with >50% missing
cols_to_drop = data.columns[data.isnull().mean() > threshold]
data = data.drop(columns=cols_to_drop)
print("Dropped columns:", list(cols_to_drop))

# For numeric columns, fill missing with median
num_cols = data.select_dtypes(include=[np.number]).columns
data[num_cols] = data[num_cols].fillna(data[num_cols].median())

# For categorical columns, fill missing with mode
cat_cols = data.select_dtypes(include=['object', 'category', 'bool']).columns
for col in cat_cols:
    if data[col].isnull().any():
        data[col] = data[col].fillna(data[col].mode()[0])

print("Any missing left?", data.isnull().any().any())

Missing values per column:
UnderwrittenCoverID              0
PolicyID                         0
TransactionMonth                 0
IsVATRegistered                  0
Citizenship                      0
LegalType                        0
Title                            0
Language                         0
Bank                         82350
AccountType                  30670
MaritalStatus                  450
Gender                           0
Country                          0
Province                         0
PostalCode                       0
MainCrestaZone                   0
SubCrestaZone                    0
ItemType                         0
mmcode                         215
VehicleType                    215
RegistrationYear                 0
make                           215
Model                          215
Cylinders                      215
cubiccapacity                  215
kilowatts                      215
bodytype                       215
NumberOfDoors               

## 3. Feature Selection & Encoding

Select features and encode categorical variables.

In [3]:
# Select features (drop high cardinality or irrelevant columns)
target = 'TotalClaims'
drop_cols = ['TotalClaims', 'PolicyID', 'UnderwrittenCoverID', 'TransactionMonth', 'VehicleIntroDate']  # adjust as needed

# Keep only numeric and relevant categorical columns
features = [col for col in data.columns if col not in drop_cols]

X = data[features]
y = data[target]

# Encode categorical variables
X = pd.get_dummies(X, drop_first=True)
print("Feature matrix shape after encoding:", X.shape)

Feature matrix shape after encoding: (611066, 1819)


## 4. Train/Test Split

Split the data for training and testing.

In [4]:
from sklearn.model_selection import train_test_split

if len(X) == 0 or len(y) == 0:
    print("X or y is empty. Please check your data preprocessing steps.")
    X_train, X_test, y_train, y_test = None, None, None, None
else:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (488852, 1819) Test shape: (122214, 1819)


## 5. Model Training

Train Linear Regression and Random Forest models.

In [None]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
if X_train is not None and y_train is not None:
    lr.fit(X_train, y_train)
else:
    print("Training data not available. Please check your preprocessing steps.")

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=42)
if X_train is not None and y_train is not None:
    rf.fit(X_train, y_train)
else:
    print("Training data not available. Please check your preprocessing steps.")

Training data not available. Please check your preprocessing steps.


## 6. Model Evaluation

Evaluate both models using RMSE, MAE, and R².

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, X_test, y_test):
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R^2: {r2:.3f}")

if X_test is not None and y_test is not None:
    print("Linear Regression:")
    evaluate_model(lr, X_test, y_test)
    print("\nRandom Forest:")
    evaluate_model(rf, X_test, y_test)
else:
    print("Test data not available. Please check your preprocessing steps.")

Test data not available. Please check your preprocessing steps.


## 7. Feature Importance

Visualize feature importances for Random Forest and coefficients for Linear Regression.

In [None]:
import matplotlib.pyplot as plt

if hasattr(rf, "feature_importances_"):
    importances = rf.feature_importances_
    features = X.columns
    indices = np.argsort(importances)[::-1][:10]  # top 10
    plt.figure(figsize=(8,5))
    plt.barh(range(len(indices)), importances[indices][::-1], align='center')
    plt.yticks(range(len(indices)), [features[i] for i in indices][::-1])
    plt.xlabel('Feature Importance')
    plt.title('Top 10 Random Forest Feature Importances')
    plt.tight_layout()
    plt.show()
else:
    print("RandomForestRegressor is not fitted. Please fit the model before plotting feature importances.")

RandomForestRegressor is not fitted. Please fit the model before plotting feature importances.


In [None]:
if hasattr(lr, "coef_"):
    coefficients = pd.Series(lr.coef_, index=X.columns)
    print("Top 10 Linear Regression coefficients (by absolute value):")
    print(coefficients.abs().sort_values(ascending=False).head(10))
else:
    print("LinearRegression model is not fitted or no coefficients are available.")

LinearRegression model is not fitted or no coefficients are available.


## 8. Save the Best Model

Save the Random Forest model for future use.

In [None]:
import joblib

joblib.dump(rf, 'random_forest_model.joblib')
print("Random Forest model saved as 'random_forest_model.joblib'")

['random_forest_model.joblib']