# Classifying Risks Leveraging Poisson Regression Model

- The frequency of claim is the target variable here.
- Minimal risk rating factor that can be leveraged to classify risks into frequency bands will be a good outcome to nail down
- Make the model an inference service that can bedeployed

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## But First, Shall We Prepare the Dataset 

- Claims data includes yearly totals and breakdowns by claim type.
- Aggregate the claims data to obtain the claims frequency unique to each ID and year
- Merge result with rating factors on (ID, year).
- Assign a frequency of 0 to policies with no recorded claims.

In [3]:
rating_factors = pd.read_csv('../data/input/exp/Motor vehicle insurance data.csv', delimiter=";")
claims =  pd.read_csv('../data/input/exp/sample type claim.csv', delimiter=';')

In [4]:
claims_frequency  = (
    claims
    .groupby(['ID', 'Cost_claims_year'])
    .agg({
        'Cost_claims_by_type': 'count'})
    .rename(columns={'Cost_claims_by_type': 'claims_frequency'})
    .reset_index()
)

In [None]:
dataset = (
    pd.merge(
        left=rating_factors,
        right=claims_frequency,
        how='left',
        on=['ID', 'Cost_claims_year']
    ).fillna
)
dataset.describe()

Unnamed: 0,ID,Distribution_channel,Seniority,Policies_in_force,Max_policies,Max_products,Lapse,Payment,Premium,Cost_claims_year,...,Area,Second_driver,Year_matriculation,Power,Cylinder_capacity,Value_vehicle,N_doors,Length,Weight,claims_frequency
count,105555.0,105555.0,105555.0,105555.0,105555.0,105555.0,105555.0,105555.0,105555.0,105555.0,...,105555.0,105555.0,105555.0,105555.0,105555.0,105555.0,105555.0,95226.0,105555.0,5258.0
mean,26271.286789,0.45131,6.696604,1.455649,1.837232,1.065842,0.221837,0.31918,315.892557,153.557305,...,0.273895,0.123708,2004.728038,92.682611,1617.759367,18413.657243,4.067898,4.252007,1191.262422,1.401483
std,15388.309324,0.497626,6.263911,0.928427,1.155536,0.267807,0.464858,0.466161,140.927969,1477.112362,...,0.445958,0.32925,6.767037,37.012645,604.697382,9135.074235,1.511839,0.39322,458.081834,0.750559
min,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,40.14,0.0,...,0.0,0.0,1950.0,0.0,49.0,270.46,0.0,1.978,43.0,1.0
25%,12925.0,0.0,3.0,1.0,1.0,1.0,0.0,0.0,241.61,0.0,...,0.0,0.0,2001.0,75.0,1390.0,13127.21,3.0,3.999,1043.0,1.0
50%,26082.0,0.0,4.0,1.0,2.0,1.0,0.0,0.0,292.28,0.0,...,0.0,0.0,2005.0,90.0,1598.0,17608.77,5.0,4.23,1205.0,1.0
75%,39754.0,1.0,9.0,2.0,2.0,1.0,0.0,1.0,361.64,0.0,...,1.0,0.0,2008.0,110.0,1910.0,22595.0,5.0,4.443,1388.0,2.0
max,53502.0,1.0,40.0,17.0,17.0,4.0,7.0,1.0,2993.34,260853.24,...,1.0,1.0,2018.0,580.0,7480.0,220675.8,6.0,8.218,7300.0,9.0


## Claims rate per unit exposure
- The data is structured such that each record is policy that runs for a year, as such let's calculate the claims rate per unit exposure (policy length)
- Creating matrix a using - Age, driving lenght, insurable object type (e.g van, motorcycle etc.)
    - Age banded into 3 levels of 0-35, 36-70, >70 as 0,1,2
    - Driving length banded into 2 levels of less than 5 years of experience and greater than 5 years experience as 0,1
    - Type risk has levels 1,2,3,4 representing motorbikes, van, passengers and agricultural vehicles respectively

In [41]:
rating_factors_a = (
    dataset
    .copy()[['Date_driving_licence','Date_birth','Type_risk', 'claims_frequency','Cost_claims_year']]
    .fillna({'Date_driving_licence': pd.NaT,  'claims_frequency': 0})
    .assign(age_band=lambda x: pd.cut((pd.Timestamp.now() - pd.to_datetime(x['Date_birth'], dayfirst=True)).dt.days // 365, 
                                 bins=[0, 35, 70, float('inf')], 
                                 labels=[0, 1, 2], 
                                 include_lowest=True).astype(int))
    .assign(driving_length=lambda x:((pd.Timestamp.now() - pd.to_datetime(x['Date_driving_licence'], dayfirst=True)).dt.days // 365 >= 5).astype(int))
    .drop(columns=['Date_driving_licence', 'Date_birth'], axis=1)
)
rating_factors_a

Unnamed: 0,Type_risk,claims_frequency,Cost_claims_year,age_band,driving_length
0,1,0.0,0.0,1,1
1,1,0.0,0.0,1,1
2,1,0.0,0.0,1,1
3,1,0.0,0.0,1,1
4,1,0.0,0.0,1,1
...,...,...,...,...,...
105550,3,0.0,0.0,1,1
105551,3,0.0,0.0,1,1
105552,3,0.0,0.0,1,1
105553,2,0.0,0.0,2,1


In [42]:
rating_factors_a_matrix = (
    rating_factors_a
    .groupby(['age_band', 'driving_length', 'Type_risk'])
    .agg({'claims_frequency':'sum', 'Cost_claims_year':'sum'})
    .assign(claims_rate_per_unit_exposure=lambda x:(x['claims_frequency']/x['Cost_claims_year']))
    .reset_index()
)
rating_factors_a_matrix

Unnamed: 0,age_band,driving_length,Type_risk,claims_frequency,Cost_claims_year,claims_rate_per_unit_exposure
0,0,1,1,27.0,51431.88,0.000525
1,0,1,2,24.0,49850.71,0.000481
2,0,1,3,496.0,1423505.43,0.000348
3,0,1,4,0.0,0.0,
4,1,1,1,209.0,224939.07,0.000929
5,1,1,2,822.0,1917519.21,0.000429
6,1,1,3,5111.0,11103401.38,0.00046
7,1,1,4,0.0,192.67,0.0
8,2,1,1,16.0,32685.29,0.00049
9,2,1,2,141.0,230383.4,0.000612
