# Final Summary Report

Comprehensive summary of the auto insurance pricing model analysis with key findings and recommendations.

In [54]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

print('âœ“ Libraries imported')

âœ“ Libraries imported


In [55]:
print(f"\nðŸ’¡ RECOMMENDATIONS:")
print(f"\n1. PRICING: Use predicted frequencies for premium calculation")
print(f"2. RISK: Driver age and vehicle characteristics are key differentiators")
print(f"3. MONITORING: Track model performance monthly and update quarterly")
print(f"4. STRATEGY: Leverage lift rankings for underwriting and pricing")

print(f"\n" + "="*80)
print("âœ¨ ANALYSIS COMPLETE âœ¨")
print("="*80)


ðŸ’¡ RECOMMENDATIONS:

1. PRICING: Use predicted frequencies for premium calculation
2. RISK: Driver age and vehicle characteristics are key differentiators
3. MONITORING: Track model performance monthly and update quarterly
4. STRATEGY: Leverage lift rankings for underwriting and pricing

âœ¨ ANALYSIS COMPLETE âœ¨


## Key Recommendations

In [56]:
print(f"\nðŸ’¾ OUTPUT FILES GENERATED:")
print(f"  âœ“ Processed Data: ../data/processed/processed.csv")
print(f"  âœ“ Predictions: ../data/processed/predictions_output.csv")
print(f"  âœ“ Visualizations:")
print(f"    - ../results/01_EDA_Frequency_Distributions.png")
print(f"    - ../results/02_EDA_Key_Insights.png")
print(f"    - ../results/04_Lift_Analysis_Performance.png")
print(f"    - ../results/05_Model_Diagnostics_Residuals.png")
print(f"  âœ“ Model Summary: ../results/03_GLM_Model_Summary.txt")


ðŸ’¾ OUTPUT FILES GENERATED:
  âœ“ Processed Data: ../data/processed/processed.csv
  âœ“ Predictions: ../data/processed/predictions_output.csv
  âœ“ Visualizations:
    - ../results/01_EDA_Frequency_Distributions.png
    - ../results/02_EDA_Key_Insights.png
    - ../results/04_Lift_Analysis_Performance.png
    - ../results/05_Model_Diagnostics_Residuals.png
  âœ“ Model Summary: ../results/03_GLM_Model_Summary.txt


## Output Files

In [57]:
df = pd.read_csv("../data/processed/predictions_output.csv")

df_sorted = df.sort_values('predicted_freq', ascending=False).reset_index(drop=True)
df_sorted['row_num'] = np.arange(1, len(df_sorted) + 1)
df_sorted['cumsum_claims'] = df_sorted['ClaimNb'].cumsum()
df_sorted['cumsum_exposure'] = df_sorted['Exposure'].cumsum()
df_sorted['percentile'] = (df_sorted['row_num'] / len(df_sorted)) * 100
df_sorted['cumsum_freq_actual'] = df_sorted['cumsum_claims'] / df_sorted['cumsum_exposure']
overall_freq = df['ClaimNb'].sum() / df['Exposure'].sum()
df_sorted['lift'] = df_sorted['cumsum_freq_actual'] / overall_freq

print(f"\nðŸ“Š LIFT CHART INSIGHTS:")
for p in [10, 20, 30]:
    idx = min(int(np.ceil(p * len(df_sorted) / 100)) - 1, len(df_sorted) - 1)
    lift_val = df_sorted.loc[idx, 'lift']
    print(f"  â€¢ Top {p}% of portfolio: {lift_val:.2f}x lift")
print(f"\n  â†’ Model successfully identifies and ranks high-risk policies")


ðŸ“Š LIFT CHART INSIGHTS:
  â€¢ Top 10% of portfolio: 1.84x lift
  â€¢ Top 20% of portfolio: 1.67x lift
  â€¢ Top 30% of portfolio: 1.50x lift

  â†’ Model successfully identifies and ranks high-risk policies


## Lift Analysis

In [58]:
mae_glm = np.mean(np.abs(df['freq'] - df['predicted_freq']))
mae_baseline = np.mean(np.abs(df['freq'] - df['freq'].mean()))
mae_improvement = ((mae_baseline - mae_glm) / mae_baseline * 100)

rmse_glm = np.sqrt(np.mean((df['freq'] - df['predicted_freq'])**2))
rmse_baseline = np.sqrt(np.mean((df['freq'] - df['freq'].mean())**2))
rmse_improvement = ((rmse_baseline - rmse_glm) / rmse_baseline * 100)

print(f"\nðŸ“ˆ GLM MODEL PERFORMANCE:")
print(f"  â€¢ Model Type: Poisson GLM with exposure offset")
print(f"  â€¢ MAE Improvement: {mae_improvement:.2f}% vs baseline")
print(f"  â€¢ RMSE Improvement: {rmse_improvement:.2f}% vs baseline")
print(f"  â€¢ MAE (GLM): {mae_glm:.6f} vs Baseline: {mae_baseline:.6f}")
print(f"  â€¢ RMSE (GLM): {rmse_glm:.6f} vs Baseline: {rmse_baseline:.6f}")


ðŸ“ˆ GLM MODEL PERFORMANCE:
  â€¢ Model Type: Poisson GLM with exposure offset
  â€¢ MAE Improvement: 28.39% vs baseline
  â€¢ RMSE Improvement: -0.03% vs baseline
  â€¢ MAE (GLM): 0.359068 vs Baseline: 0.501407
  â€¢ RMSE (GLM): 4.595436 vs Baseline: 4.593912


## Model Performance

In [59]:
df_raw = pd.read_csv("../data/raw/freMTPL2freq.csv") 
df['DrivAge_group'] = pd.cut(df['DrivAge'], bins=[17, 25, 35, 50, 70, 100], labels=['18-25', '26-35', '36-50', '51-70', '71+'])
df['VehAge_group'] = pd.cut(df['VehAge'], bins=[-1, 5, 10, 20, 50], labels=['0-5', '6-10', '11-20', '21+'])

print("\n" + "="*80)
print("AUTO INSURANCE PRICING MODEL - FINAL SUMMARY")
print("="*80)

print(f"\nðŸ“Š DATASET STATISTICS:")
print(f"  â€¢ Original records: {len(df_raw):,}")
print(f"  â€¢ Cleaned records: {len(df):,}")
print(f"  â€¢ Total claims: {df['ClaimNb'].sum():,.0f}")
print(f"  â€¢ Total exposure: {df['Exposure'].sum():,.2f}")
print(f"  â€¢ Overall frequency: {df['ClaimNb'].sum() / df['Exposure'].sum():.6f}")

print(f"\nðŸ”§ FEATURE ENGINEERING:")
print(f"  â€¢ Driver Age Groups: {df['DrivAge_group'].nunique() if 'DrivAge_group' in df.columns else 'N/A'} categories")
print(f"  â€¢ Vehicle Age Groups: {df['VehAge_group'].nunique() if 'VehAge_group' in df.columns else 'N/A'} categories")
if 'VehPower' in df.columns:
    print(f"  â€¢ Vehicle Power: {df['VehPower'].nunique()} levels")
else:
    print("  â€¢ Vehicle Power: N/A (column missing)")
if 'Region' in df.columns:
    print(f"  â€¢ Regions: {df['Region'].nunique()} regions")
else:
    print("  â€¢ Regions: N/A (column missing)")



AUTO INSURANCE PRICING MODEL - FINAL SUMMARY

ðŸ“Š DATASET STATISTICS:
  â€¢ Original records: 678,013
  â€¢ Cleaned records: 678,013
  â€¢ Total claims: 36,102
  â€¢ Total exposure: 358,499.45
  â€¢ Overall frequency: 0.100703

ðŸ”§ FEATURE ENGINEERING:
  â€¢ Driver Age Groups: 5 categories
  â€¢ Vehicle Age Groups: 4 categories
  â€¢ Vehicle Power: N/A (column missing)
  â€¢ Regions: N/A (column missing)


## Executive Summary

In [60]:
df_raw = pd.read_csv("../data/raw/freMTPL2freq.csv")
df = pd.read_csv("../data/processed/processed.csv")
predictions = pd.read_csv("../data/processed/predictions_output.csv")

df = pd.concat([df, predictions[['predicted_freq', 'predicted_claims', 'residuals']]], axis=1)
df['standardized_residuals'] = df['residuals'] / df['residuals'].std()

print("âœ“ All analysis results loaded")
print(f"Dataset: {df.shape[0]:,} records with {df.shape[1]} features")

âœ“ All analysis results loaded
Dataset: 678,013 records with 19 features


## Load Analysis Results