# Imports

In [None]:
### imports pt1
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# save to file
from joblib import dump

In [None]:
### imports pt2
# sklearn stuff
# NB! depending on your setup it may be necessary to install C++ build tools in order to install sklearn
# this is possible through the VS installer as (roughly) described here: https://wiki.python.org/moin/WindowsCompilers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import PoissonRegressor

## Definitions

In [None]:
### load csv and format it appropriately
def load_dataset(file='Case_study_data_v1.csv'):
    """Read the semicolon-separated case-study file into a DataFrame.

    Args:
        file: path (or any other input ``pd.read_csv`` accepts) of the CSV to read.

    Returns:
        DataFrame in which parameter_1 and parameter_5..parameter_9 are read with
        the ``category`` dtype; all other columns keep the dtype pandas infers.
    """
    categorical_columns = (
        'parameter_1',
        'parameter_5',
        'parameter_6',
        'parameter_7',
        'parameter_8',
        'parameter_9',
    )
    column_dtypes = {column: 'category' for column in categorical_columns}

    return pd.read_csv(file, sep=';', dtype=column_dtypes)

In [None]:
### check distribution of variables, plus claim rate per value of variable
def check_parameter(df, par):
    """Aggregate exposure and claims per value of ``par``, then print and plot them.

    Args:
        df: DataFrame with at least the columns ``par``, 'exposure' and 'nr_claims'.
        par: name of the column to group by.

    Returns:
        The aggregated DataFrame: one row per value of ``par`` holding the sums of
        the numeric columns plus 'claims_rate' (= summed nr_claims / summed exposure).

    Side effects:
        Prints the aggregated table and shows a figure with exposure as bars and
        the claim rate as points on a twin y-axis.
    """
    # numeric_only=True: summing the categorical feature columns is meaningless and
    # raises a TypeError on recent pandas (older versions silently dropped them)
    df_agg = df.groupby(par, as_index=False).sum(numeric_only=True)
    df_agg['claims_rate'] = df_agg['nr_claims']/df_agg['exposure']

    print(df_agg[[par,'exposure','nr_claims','claims_rate']])

    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()

    sns.barplot(data=df_agg, x=par, y='exposure', ax=ax1)
    # a bit of a seaborn hack for two plots on same x-axis:
    # the points are placed by row position (0..n-1) rather than by the value of par
    sns.scatterplot(data=df_agg, x=np.arange(0,len(df_agg)), y='claims_rate', ax=ax2)

    plt.show()

    return df_agg

In [None]:
### compare observed and predicted claim rates per value of variable
def check_parameter_predict(df, par):
    """Aggregate observed and predicted claims per value of ``par``, print and plot them.

    Args:
        df: DataFrame with at least the columns ``par``, 'exposure', 'nr_claims'
            and 'nr_claims_predict'.
        par: name of the column to group by.

    Returns:
        The aggregated DataFrame: one row per value of ``par`` holding the sums of
        the numeric columns plus 'claims_rate' and 'claims_rate_predict' (summed
        observed / predicted claims divided by summed exposure).

    Side effects:
        Prints the aggregated table and shows a two-panel figure: observed vs.
        predicted rate per value (left), exposure bars with the observed claim
        rate on a twin y-axis (right).
    """
    # numeric_only=True: summing the categorical feature columns is meaningless and
    # raises a TypeError on recent pandas (older versions silently dropped them)
    df_agg = df.groupby(par, as_index=False).sum(numeric_only=True)
    df_agg['claims_rate'] = df_agg['nr_claims']/df_agg['exposure']
    df_agg['claims_rate_predict'] = df_agg['nr_claims_predict']/df_agg['exposure']

    print(df_agg[[par,'nr_claims','nr_claims_predict','claims_rate','claims_rate_predict']])

    fig, axes = plt.subplots(1,2,figsize=(15,5))

    # plot predicted rates vs observed rates in each category
    # (wide-form frame indexed by the values of par -> one point series per rate column)
    idx_series = pd.Index(df_agg[par])
    df_series = df_agg[['claims_rate','claims_rate_predict']].set_index(idx_series)

    sns.scatterplot(data=df_series, ax=axes[0])

    # the normal plot from check_parameter()
    sns.barplot(data=df_agg, x=par, y='exposure', ax=axes[1])
    # a bit of a seaborn hack for two plots on same x-axis:
    # the points are placed by row position (0..n-1) rather than by the value of par
    sns.scatterplot(data=df_agg, x=np.arange(0,len(df_agg)), y='claims_rate', ax=axes[1].twinx())

    plt.show()

    return df_agg

In [None]:
def prep_data(df):
    """Split the raw frame into features, target claim rate and exposure weights.

    Rows whose exposure is not strictly positive are dropped. The frame passed in
    is left unmodified.

    Args:
        df: DataFrame with 'exposure' and 'nr_claims' columns plus feature columns.

    Returns:
        X: the remaining rows without 'exposure', 'nr_claims' and 'claims_rate'.
        y: Series 'claims_rate' (= nr_claims / exposure) for the same rows.
        w: Series 'exposure' for the same rows.
    """
    # exposure zero. could aggregate across features, which would make sense if those rows had exp>0
    # i can't tell that from the data though, so i just remove these rows
    # filter first and work on a copy: no division by zero exposure, and the
    # caller's frame does not silently gain a 'claims_rate' column
    df = df[df['exposure']>0].copy()
    df['claims_rate'] = df['nr_claims']/df['exposure']

    X = df.drop(columns=['exposure','nr_claims','claims_rate'])
    y, w = df['claims_rate'], df['exposure']

    return X, y, w

In [None]:
### split dataset into train/score (if necessary) and rescale/encode variables
def split_prep_data(df, num_feat, cat_feat, split=True, test_size=0.25, random_state=1337):
    """Prepare the data, optionally split it, and fit the feature transformer.

    Args:
        df: raw DataFrame, passed to prep_data().
        num_feat: list of numerical column names (standard-scaled).
        cat_feat: list of categorical column names (one-hot encoded).
        split: if True, split into train and test parts; if False, everything is
            used for training and the test outputs are None.
        test_size: share of rows held out for testing when split is True.
        random_state: seed of the train/test split, for reproducibility.

    Returns:
        X_train, X_test, y_train, y_test, w_train, w_test, big_transformer
        where big_transformer is the ColumnTransformer fitted on X_train only.
        The returned X frames are NOT transformed yet.
    """
    X, y, w = prep_data(df)

    big_transformer = ColumnTransformer(
    [
        # want to rescale numerical features to mean=0 var=1
        ('num', StandardScaler(), num_feat),

        # want to one-hot encode categorical features
        # TODO: OneHotEncoder() supports array argument to drop=
            # this will specify which category should be dropped (baseline category)
            # ideally this would be category with most exposure
        # NOTE(review): with drop='first' an unknown category presumably encodes as all
        # zeros, i.e. the same as the dropped baseline -- confirm against the sklearn docs
        ('cat', OneHotEncoder(handle_unknown='ignore',drop='first'), cat_feat)
    ]
    )

    if split:
        X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
            X, y, w, test_size=test_size, random_state=random_state
        )
    else:
        X_train, X_test, y_train, y_test, w_train, w_test = X, None, y, None, w, None

    # fit on the training part only so no test information reaches the scaler/encoder
    big_transformer.fit(X_train)

    return X_train, X_test, y_train, y_test, w_train, w_test, big_transformer

In [None]:
def train_model(X_train, y_train, w_train, alpha=1e-3, max_iter=5000):
    """Fit a Poisson regressor on the (already transformed) training data.

    Args:
        X_train: transformed feature matrix.
        y_train: target values (the notebook passes claim rates).
        w_train: sample weights (the notebook passes exposure).
        alpha: regularisation strength handed to PoissonRegressor.
        max_iter: iteration limit handed to PoissonRegressor.

    Returns:
        The fitted PoissonRegressor.
    """
    # log-link Poisson GLM
    model = PoissonRegressor(alpha=alpha, max_iter=max_iter)

    model.fit(X_train, y_train, sample_weight=w_train)

    return model

# Load dataset and look at it

In [None]:
# read the case-study CSV from the default path set in load_dataset()
df = load_dataset()

In [None]:
# show the loaded frame as the cell output
df

In [None]:
# structure of the frame first, then summary statistics as the cell output
df.info()
df.describe()

In [None]:
# checking how exposure and claim rate vary across the values of one variable
# (change the column name to inspect another parameter)
df_agg = check_parameter(df, 'parameter_4')

# Program

In [None]:
# set to False to use ALL of case data for training.
# otherwise a test part is split off and saved to disk in the "Save model" cell
split_case_data = True

# numerical features: standard-scaled by the transformer
num_feat = ['parameter_2', 'parameter_3', 'parameter_4']
# ignore parameter_1 for now as it is too messy (groups too small).
# if i knew what it represented i could possibly do some groupings
# categorical features: one-hot encoded by the transformer
cat_feat = ['parameter_5', 'parameter_6', 'parameter_7', 'parameter_8', 'parameter_9']

# tf is the ColumnTransformer returned by split_prep_data, already fitted on X_train
X_train, X_test, y_train, y_test, w_train, w_test, tf = split_prep_data(df, num_feat, cat_feat, split=split_case_data)

# scale/encode the training features with the fitted transformer
X_train_tf = tf.transform(X_train)

# fit the Poisson GLM on claim rates, weighted by exposure
model = train_model(X_train_tf, y_train, w_train)

In [None]:
# compile training data and predictions
# rows are matched by position: the index is reset first, then target and
# predictions are attached as plain arrays in the same row order
X_train_full = (
    pd.concat([X_train, w_train], axis=1)
    .reset_index(drop=True)
    .assign(
        claims_rate=y_train.to_numpy(),
        claims_rate_predict=model.predict(X_train_tf),
    )
    # turn observed / predicted rates back into claim counts via exposure
    .assign(
        nr_claims=lambda frame: frame['claims_rate'] * frame['exposure'],
        nr_claims_predict=lambda frame: frame['claims_rate_predict'] * frame['exposure'],
    )
)

In [None]:
# visually compare predictions to training data, aggregated per value of one feature
# (change the column name to inspect another parameter)
train_df_agg = check_parameter_predict(X_train_full, 'parameter_8')

## Save model (and optionally scoring data)

In [None]:
#save model and transformer
dump(model, 'model.joblib')
dump(tf, 'transformer.joblib')

if split_case_data:
    # claim counts are rebuilt as rate * exposure. in floating point that product is
    # only approximately integral (e.g. 2.9999999999999996), so round before casting
    # to int64 -- a plain integer cast would truncate such a value to 2

    #save scoring data. same structure as in initial file
    df_test = pd.concat([X_test, w_test, (y_test*w_test).round().astype('int64').to_frame('nr_claims')], axis=1)
    df_test.to_csv('test_data.csv', sep=';', index=False)

    #save training data. same structure as in initial file
    df_train = pd.concat([X_train, w_train, (y_train*w_train).round().astype('int64').to_frame('nr_claims')], axis=1)
    df_train.to_csv('train_data.csv', sep=';', index=False)

## Extra stuff

In [None]:
# expected number of columns in the transformed matrix:
# [#categories in the chosen cat. features] + [#num. features] - [#dropped baseline categories]
(9+5+4+5+2)+3-5

In [None]:
# shape of the transformed training matrix; second entry = number of model variables
print(X_train_tf.shape)

# number of unique values in each column of the raw data
print(df.nunique())
# [#Cats in all chosen cat. features] + [#chosen num. features] - [#dropped categories]
# (one category is dropped per categorical feature because of drop='first')
(9+5+4+5+2)+3-5
# this number should be the same as number of columns in the transformed training data
# parameter_1 fails this sanity check because some categories are very small
# (presumably they can be missing from the training split entirely -- TODO confirm)