# 1. Load Data Set

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [2]:
# Path of the CSV file that is read into the working DataFrame.
path_to_data = './processed_data/sev_freq_joined_cliped.csv'

# Parse the file with pandas' default CSV settings, then report success.
df = pd.read_csv(filepath_or_buffer=path_to_data)
print("Data loaded successfully.")

Data loaded successfully.


# Feature Transformation Pipeline
These transformations follow the paper "Case Study: French Motor Third-Party Liability Claims" ([SSRN 3164764](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3164764)).

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, \
    OrdinalEncoder, PolynomialFeatures, StandardScaler

def gen_col_trans(drop=True, standardize=False):
    """Generate a ColumnTransformer and the matching list of output names.

    With drop=False and standardize=False, the transformer corresponds to the
    GLM of the case study paper.

    Parameters
    ----------
    drop : bool, default=True
        If truthy, one reference level is removed from every one-hot encoded
        feature (VehPower 4, VehAge [1, 10], DrivAge [41, 51), VehBrand B1,
        VehGas Diesel, Region R24). If falsy, k categories are encoded with
        k binary features (redundant).
    standardize : bool, default=False
        If truthy, a StandardScaler is appended to the numerical pipelines
        (BonusMalus, Density_log, Area_ord).

    Returns
    -------
    column_trans : ColumnTransformer
        The unfitted transformer. Input columns that are not listed in one of
        its blocks are dropped.
    column_trans_names : list of str
        Hard-coded labels for the output columns, in the order of the
        transformer blocks. Within a one-hot block the labels follow the
        encoder's sorted category order.
        NOTE(review): the labels assume that exactly the listed categories
        occur in the data the transformer is fitted on -- confirm against
        the data set.
    """
    # Reference level removed from each one-hot block. The VehAge/DrivAge
    # entries are bin indices produced by np.digitize, not raw ages.
    reference_levels = {
        'VehPower': [4],
        'VehAge': [1],
        'DrivAge': [4],
        'VehBrand': ['B1'],
        'VehGas': ['Diesel'],
        'Region': ['R24'],
    }
    if not drop:
        # drop=None tells OneHotEncoder to keep every level.
        reference_levels = dict.fromkeys(reference_levels)

    def numeric_pipeline(*steps):
        """Build a Pipeline from ``steps``, adding a scaler if standardizing."""
        steps = list(steps)
        if standardize:
            steps.append(('norm', StandardScaler()))
        return Pipeline(steps)

    column_trans = ColumnTransformer(
        [
            # VehPower 4, 5, 6, 7, 8, 9 (values above 9 are capped at 9)
            ('VehPower_cat',
             Pipeline([
                 ('cut_9', FunctionTransformer(
                     lambda x: np.minimum(x, 9), validate=False)),
                 ('OHE', OneHotEncoder(
                     categories='auto', drop=reference_levels['VehPower'])),
             ]),
             ['VehPower']),

            # VehAge intervals [0,1), [1, 10], (10, inf).
            # 10 is remapped to 9 first so that it lands in the closed
            # interval [1, 10] instead of digitize's right-open upper bin.
            ('VehAge_cat',
             Pipeline([
                 ('bin', FunctionTransformer(
                     lambda x: np.digitize(np.where(x == 10, 9, x),
                                           bins=[1, 10]),
                     validate=False)),
                 ('OHE', OneHotEncoder(
                     categories='auto', drop=reference_levels['VehAge'])),
             ]),
             ['VehAge']),

            # DrivAge intervals [18,21), [21,26), [26,31), [31,41), [41,51),
            # [51,71), [71,inf)
            ('DrivAge_cat',
             Pipeline([
                 ('bin', FunctionTransformer(
                     lambda x: np.digitize(x, bins=[21, 26, 31, 41, 51, 71]),
                     validate=False)),
                 ('OHE', OneHotEncoder(
                     categories='auto', drop=reference_levels['DrivAge'])),
             ]),
             ['DrivAge']),

            # BonusMalus capped at 150
            ('BonusMalus',
             numeric_pipeline(
                 ('cutat150', FunctionTransformer(
                     lambda x: np.minimum(x, 150), validate=False))),
             ['BonusMalus']),

            ('VehBrand_cat',
             OneHotEncoder(drop=reference_levels['VehBrand']),
             ['VehBrand']),

            ('VehGas_Regular',
             OneHotEncoder(drop=reference_levels['VehGas']),
             ['VehGas']),

            # Natural logarithm of Density
            ('Density_log',
             numeric_pipeline(
                 ('log', FunctionTransformer(
                     lambda x: np.log(x), validate=False))),
             ['Density']),

            ('Region_cat',
             OneHotEncoder(drop=reference_levels['Region']),
             ['Region']),

            # Area as ordinal code shifted to start at 1
            ('Area_ord',
             numeric_pipeline(
                 ('OE', OrdinalEncoder()),
                 ('plus_1', FunctionTransformer(
                     lambda x: x + 1, validate=False))),
             ['Area']),
        ],
        remainder='drop')

    column_trans_names = [
        'VehPower_4', 'VehPower_5', 'VehPower_6',
        'VehPower_7', 'VehPower_8', 'VehPower_9',
        'VehAge_[0,1)', 'VehAge_[1, 10]', 'VehAge_(10,inf)',
        'DrivAge_[18,21)', 'DrivAge_[21,26)', 'DrivAge_[26,31)',
        'DrivAge_[31,41)', 'DrivAge_[41,51)', 'DrivAge_[51,71)',
        'DrivAge_[71,inf)',
        'BonusMalus',
        # OneHotEncoder orders string categories lexicographically, so 'B1'
        # comes before 'B10'; the labels must follow that order to line up
        # with the encoded columns when no level is dropped.
        'VehBrand_B1', 'VehBrand_B10', 'VehBrand_B11', 'VehBrand_B12',
        'VehBrand_B13', 'VehBrand_B14', 'VehBrand_B2', 'VehBrand_B3',
        'VehBrand_B4', 'VehBrand_B5', 'VehBrand_B6',
        'VehGas_Diesel', 'VehGas_Regular',
        'Density_log',
        'Region_R11', 'Region_R21', 'Region_R22', 'Region_R23',
        'Region_R24', 'Region_R25', 'Region_R26', 'Region_R31',
        'Region_R41', 'Region_R42', 'Region_R43', 'Region_R52',
        'Region_R53', 'Region_R54', 'Region_R72', 'Region_R73',
        'Region_R74', 'Region_R82', 'Region_R83', 'Region_R91',
        'Region_R93', 'Region_R94',
        'Area_ord',
    ]
    if drop:
        # Remove the labels of the reference levels the encoders drop.
        dropped_names = {
            'VehPower_4', 'VehAge_[1, 10]', 'DrivAge_[41,51)',
            'VehBrand_B1', 'VehGas_Diesel', 'Region_R24',
        }
        column_trans_names = [name for name in column_trans_names
                              if name not in dropped_names]
    return column_trans, column_trans_names

In [4]:
# Build the transformer with every one-hot level kept and no scaling,
# together with the labels of its output columns.
col_tranformer, col_transformer_names = gen_col_trans(
    standardize=False,
    drop=False,
)

# Fit the transformer on the whole frame and encode it in the same call.
X = col_tranformer.fit_transform(df)

# Target-related columns as a plain array, in this column order.
y = df.loc[:, ['PurePremium', 'Frequency', 'AvgClaimAmount', 'Exposure']].to_numpy()

In [13]:
from sklearn.model_selection import train_test_split

# Split the raw rows and the encoded matrix in a single call so both are
# partitioned by the same seeded shuffle (random_state=0).
df_train, df_test, X_train, X_test = train_test_split(
    df,
    X,
    random_state=0,
)

# Re-wrap both partitions as DataFrames carrying the original column labels.
df_train, df_test = (
    pd.DataFrame(part, columns=df.columns) for part in (df_train, df_test)
)

In [15]:
# Write the raw train/test rows to CSV without the index column.
df_train.to_csv(path_or_buf='df_train.csv', index=False)
df_test.to_csv(path_or_buf='df_test.csv', index=False)

# Store the encoded matrices as dense .npy arrays.
# NOTE(review): .toarray() assumes the transformer returned sparse
# matrices -- confirm if the transformer options change.
np.save(file='X_train.npy', arr=X_train.toarray())
np.save(file='X_test.npy', arr=X_test.toarray())
