# GLM Model Development

Build a Poisson GLM with exposure offset for insurance pricing. Includes feature engineering, model fitting, and predictions.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families
from statsmodels.tools.tools import add_constant
import warnings
warnings.filterwarnings('ignore')

print("✓ Libraries imported")

✓ Libraries imported


## Load Processed Data

In [2]:
# Load processed data
df = pd.read_csv("../data/processed/processed.csv")

print(f"Loaded dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Convert categorical columns
categorical_cols = ['DrivAge_group', 'VehAge_group']
for col in categorical_cols:
    if col in df.columns:
        df[col] = df[col].astype('category')

Loaded dataset shape: (678013, 15)
Columns: ['IDpol', 'ClaimNb', 'Exposure', 'Area', 'VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'VehBrand', 'VehGas', 'Density', 'Region', 'freq', 'DrivAge_group', 'VehAge_group']


## Prepare Features for GLM

In [3]:
# Prepare data for GLM modeling
# Create dummy variables for categorical features
feature_cols = ['DrivAge_group', 'VehAge_group', 'VehPower', 'Region']
X_raw = pd.get_dummies(df[feature_cols], drop_first=True)
# Convert boolean columns to numeric (required by GLM)
X_raw = X_raw.astype('float64')
X = add_constant(X_raw)  # Add intercept

# Define target and offset
y = df['ClaimNb']  # Claims count
exposure = df['Exposure']  # Exposure offset

print("=" * 80)
print("GLM MODEL SETUP")
print("=" * 80)
print(f"Features (including constant): {X.shape[1]}")
print(f"Observations: {X.shape[0]}")
print(f"Target variable (ClaimNb) - Mean: {y.mean():.4f}, Std: {y.std():.4f}")
print(f"Exposure - Mean: {exposure.mean():.4f}, Std: {exposure.std():.4f}")
print(f"\nFeature names (first 10): {list(X.columns[:10])}")

GLM MODEL SETUP
Features (including constant): 31
Observations: 678013
Target variable (ClaimNb) - Mean: 0.0532, Std: 0.2401
Exposure - Mean: 0.5288, Std: 0.3644

Feature names (first 10): ['const', 'VehPower', 'DrivAge_group_25-35', 'DrivAge_group_35-50', 'DrivAge_group_50-70', 'DrivAge_group_70+', 'VehAge_group_1-5Y', 'VehAge_group_10-20Y', 'VehAge_group_20+Y', 'VehAge_group_5-10Y']


## Fit Poisson GLM

In [4]:
# Fit Poisson GLM with exposure offset
glm_model = GLM(y, X, family=families.Poisson(), offset=np.log(exposure))
glm_results = glm_model.fit()

print("\n" + "=" * 80)
print("POISSON GLM RESULTS - PRICING MODEL")
print("=" * 80)
print(glm_results.summary())


POISSON GLM RESULTS - PRICING MODEL
                 Generalized Linear Model Regression Results                  
Dep. Variable:                ClaimNb   No. Observations:               678013
Model:                            GLM   Df Residuals:                   677982
Model Family:                 Poisson   Df Model:                           30
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -1.4515e+05
Date:                Fri, 13 Feb 2026   Deviance:                   2.2098e+05
Time:                        15:36:15   Pearson chi2:                 1.74e+06
No. Iterations:                     7   Pseudo R-squ. (CS):           0.005151
Covariance Type:            nonrobust                                         
                          coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------

## Save Model Summary & Extract Key Metrics

In [5]:
# Save model summary to file
with open('../results/03_GLM_Model_Summary.txt', 'w') as f:
    f.write(glm_results.summary().as_text())
print("✓ Model summary saved to: 03_GLM_Model_Summary.txt")

# Extract and display key metrics
print("\n" + "=" * 80)
print("MODEL PERFORMANCE METRICS")
print("=" * 80)
aic_value = glm_results.aic
bic_value = glm_results.bic
deviance_value = glm_results.deviance
null_deviance_value = glm_results.null_deviance
pseudo_r2_value = 1 - (deviance_value / null_deviance_value)

print(f"AIC (Akaike Information Criterion): {aic_value:.2f}")
print(f"BIC (Bayesian Information Criterion): {bic_value:.2f}")
print(f"Deviance: {deviance_value:.2f}")
print(f"Null Deviance: {null_deviance_value:.2f}")
print(f"Pseudo R-squared (1 - Deviance/Null Deviance): {pseudo_r2_value:.4f}")

✓ Model summary saved to: 03_GLM_Model_Summary.txt

MODEL PERFORMANCE METRICS
AIC (Akaike Information Criterion): 290364.87
BIC (Bayesian Information Criterion): -8882231.41
Deviance: 220979.85
Null Deviance: 224481.15
Pseudo R-squared (1 - Deviance/Null Deviance): 0.0156


## Generate Predictions

In [6]:
# Generate predictions and calculate performance metrics
df['predicted_freq'] = glm_results.predict(X)
df['predicted_claims'] = df['predicted_freq'] * df['Exposure']

# Define baseline model: simple mean frequency
baseline_freq = df['freq'].mean()
df['baseline_freq'] = baseline_freq
df['baseline_claims'] = baseline_freq * df['Exposure']

# Calculate residuals
df['residuals'] = df['freq'] - df['predicted_freq']
df['standardized_residuals'] = df['residuals'] / df['residuals'].std()

# Calculate performance metrics
mae_glm = np.mean(np.abs(df['freq'] - df['predicted_freq']))
mae_baseline = np.mean(np.abs(df['freq'] - baseline_freq))
mae_improvement = ((mae_baseline - mae_glm) / mae_baseline * 100)

rmse_glm = np.sqrt(np.mean((df['freq'] - df['predicted_freq'])**2))
rmse_baseline = np.sqrt(np.mean((df['freq'] - baseline_freq)**2))
rmse_improvement = ((rmse_baseline - rmse_glm) / rmse_baseline * 100)

print("\n" + "=" * 80)
print("MODEL vs BASELINE COMPARISON")
print("=" * 80)
print(f"\nMean Absolute Error (MAE):")
print(f"  GLM:      {mae_glm:.6f}")
print(f"  Baseline: {mae_baseline:.6f}")
print(f"  Improvement: {mae_improvement:.2f}%")

print(f"\nRoot Mean Squared Error (RMSE):")
print(f"  GLM:      {rmse_glm:.6f}")
print(f"  Baseline: {rmse_baseline:.6f}")
print(f"  Improvement: {rmse_improvement:.2f}%")


MODEL vs BASELINE COMPARISON

Mean Absolute Error (MAE):
  GLM:      0.359068
  Baseline: 0.501407
  Improvement: 28.39%

Root Mean Squared Error (RMSE):
  GLM:      4.595436
  Baseline: 4.593912
  Improvement: -0.03%


## Save Predictions to File

In [7]:
# Save predictions to file
pred_file = '../data/processed/predictions_output.csv'
df[['DrivAge', 'VehAge', 'ClaimNb', 'Exposure', 'freq', 'predicted_freq', 'predicted_claims', 'residuals']].to_csv(pred_file, index=False)
print(f"✓ Predictions SAVED to: {pred_file}")

# Display summary
print(f"\nPredictions dataset columns: {df[['DrivAge', 'VehAge', 'ClaimNb', 'Exposure', 'freq', 'predicted_freq', 'predicted_claims', 'residuals']].columns.tolist()}")
print(f"Number of predictions: {len(df):,}")

✓ Predictions SAVED to: ../data/processed/predictions_output.csv

Predictions dataset columns: ['DrivAge', 'VehAge', 'ClaimNb', 'Exposure', 'freq', 'predicted_freq', 'predicted_claims', 'residuals']
Number of predictions: 678,013
