## But First, Shall We Prepare the Dataset 

- Claims data includes yearly totals and breakdowns by claim type.
- Aggregate the claims data to obtain the claims frequency unique to each ID and year
- Merge result with rating factors on (ID, year).
- Assign a frequency of 0 to policies with no recorded claims.

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import GeneralizedPoisson
from statsmodels.stats.outliers_influence import OLSInfluence
from sklearn.model_selection import train_test_split
from scipy import stats
import warnings

warnings.filterwarnings('ignore')

In [7]:
pwd

'/Users/olumide/Library/CloudStorage/OneDrive-Personal/Documents/Research/Project 1/underwriting assessor/notebook'

In [8]:
rating_factors = pd.read_csv('../data/input/exp/Motor_vehicle_insurance_data.csv', delimiter=";")
claims =  pd.read_csv('../data/input/exp/sample_type_claim.csv', delimiter=';')

In [9]:
claims_frequency  = (
    claims
    .groupby(['ID', 'Cost_claims_year'])
    .agg({
        'Cost_claims_by_type': 'count'})
    .rename(columns={'Cost_claims_by_type': 'claims_frequency'})
    .reset_index()
)

In [10]:
dataset = (
    pd.merge(
        left=rating_factors,
        right=claims_frequency,
        how='left',
        on=['ID', 'Cost_claims_year']
    )
)
dataset

Unnamed: 0,ID,Date_start_contract,Date_last_renewal,Date_next_renewal,Date_birth,Date_driving_licence,Distribution_channel,Seniority,Policies_in_force,Max_policies,...,Second_driver,Year_matriculation,Power,Cylinder_capacity,Value_vehicle,N_doors,Type_fuel,Length,Weight,claims_frequency
0,1,05/11/2015,05/11/2015,05/11/2016,15/04/1956,20/03/1976,0,4,1,2,...,0,2004,80,599,7068.00,0,P,,190,
1,1,05/11/2015,05/11/2016,05/11/2017,15/04/1956,20/03/1976,0,4,1,2,...,0,2004,80,599,7068.00,0,P,,190,
2,1,05/11/2015,05/11/2017,05/11/2018,15/04/1956,20/03/1976,0,4,2,2,...,0,2004,80,599,7068.00,0,P,,190,
3,1,05/11/2015,05/11/2018,05/11/2019,15/04/1956,20/03/1976,0,4,2,2,...,0,2004,80,599,7068.00,0,P,,190,
4,2,26/09/2017,26/09/2017,26/09/2018,15/04/1956,20/03/1976,0,4,2,2,...,0,2004,80,599,7068.00,0,P,,190,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105550,53498,30/07/2018,30/07/2018,30/07/2019,25/07/1981,14/02/2007,0,1,1,1,...,0,2000,110,1997,24320.00,5,D,4.740,1480,
105551,53499,16/08/2018,16/08/2018,16/08/2019,08/12/1976,29/11/2017,0,1,1,1,...,0,2013,129,1998,30861.97,5,P,4.650,1440,
105552,53500,21/11/2018,21/11/2018,21/11/2019,01/04/1974,05/10/2011,0,1,1,1,...,0,1999,55,999,7800.00,5,P,3.495,830,
105553,53501,21/11/2018,21/11/2018,21/11/2019,15/09/1946,02/02/1982,0,1,1,1,...,0,2004,90,1753,16610.00,5,D,4.555,1399,
