# Imports

In [None]:
### imports pt1
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# load from file
from joblib import load

In [None]:
### imports pt2
# sklearn stuff
# NB! depending on your setup it may be necessary to install C++ build tools in order to install sklearn
# this is possible through the VS installer as (roughly) described here: https://wiki.python.org/moin/WindowsCompilers
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance, explained_variance_score

# Definitions

In [None]:
### load csv and format it appropriately
def load_dataset(file='Case_study_data_v1.csv'):
    """Read a semicolon-separated CSV file into a DataFrame.

    The columns parameter_1 and parameter_5 .. parameter_9 are parsed with
    the pandas 'category' dtype; every other column keeps the dtype that
    pandas infers.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    categorical_columns = (
        'parameter_1',
        'parameter_5',
        'parameter_6',
        'parameter_7',
        'parameter_8',
        'parameter_9',
    )
    # same dtype for every listed column
    column_dtypes = dict.fromkeys(categorical_columns, 'category')

    return pd.read_csv(file, sep=';', dtype=column_dtypes)

In [None]:
### check distribution of variables, plus claim rate per value of variable
def check_parameter_predict(df, par):
    """Compare observed and predicted claim rates per value of one variable.

    Groups `df` by `par`, sums the numeric columns, derives the observed and
    predicted claim rates per group, prints them, and draws two panels:
    observed vs predicted rate per group (left) and exposure per group with
    the observed rate overlaid on a second y-axis (right).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `par`, 'exposure', 'nr_claims' and 'nr_claims_predict'.
    par : str
        Name of the column to group by.

    Returns
    -------
    pandas.DataFrame
        One row per group holding the summed numeric columns plus the
        derived 'claims_rate' and 'claims_rate_predict' columns.
    """
    # numeric_only=True: columns that cannot be summed (e.g. other categorical
    # parameters) are left out; recent pandas raises TypeError on them instead
    # of silently dropping them.
    # observed=False: keep one row per category level (the long-standing
    # pandas default), presumably what the positional overlay below relies on.
    df_agg = df.groupby(par, as_index=False, observed=False).sum(numeric_only=True)
    # rates are recomputed from the summed totals, not summed per row
    df_agg['claims_rate'] = df_agg['nr_claims']/df_agg['exposure']
    df_agg['claims_rate_predict'] = df_agg['nr_claims_predict']/df_agg['exposure']

    print(df_agg[[par,'nr_claims','nr_claims_predict','claims_rate','claims_rate_predict']])

    fig, axes = plt.subplots(1,2,figsize=(15,5))

    # plot predicted rates vs observed rates in each category
    idx_series = pd.Index(df_agg[par])
    df_series = df_agg[['claims_rate','claims_rate_predict']].set_index(idx_series)

    sns.scatterplot(data=df_series, ax=axes[0])
    axes[0].set_title('Observed vs predicted claims rate')

    # the normal plot from check_training()
    sns.barplot(data=df_agg, x=par, y='exposure', ax=axes[1])
    axes[1].set_title('Exposure and observed claims rate')
    # a bit of a seaborn hack for two plots on same x-axis:
    # the rate is drawn against row positions 0..n-1 on a twin y-axis
    rate_axis = axes[1].twinx()
    sns.scatterplot(data=df_agg, x=np.arange(0,len(df_agg)), y='claims_rate', ax=rate_axis)

    plt.show()

    return df_agg

In [None]:
def prep_data(df):
    """Split a claims DataFrame into features, target and sample weights.

    Rows with non-positive exposure are removed. The input frame is left
    untouched: the claims rate is derived on a filtered copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'exposure' and 'nr_claims'.

    Returns
    -------
    X : pandas.DataFrame
        Remaining columns after dropping 'exposure', 'nr_claims' and
        'claims_rate'.
    y : pandas.Series
        'claims_rate', i.e. nr_claims / exposure, per kept row.
    w : pandas.Series
        'exposure' per kept row.
    """
    # exposure zero. could aggregate across features, which would make sense if those rows had exp>0
    # i can't tell that from the data though, so i just remove these rows
    # (filtering before dividing also avoids computing x/0 rates that are thrown away)
    valid = df[df['exposure'] > 0].assign(
        claims_rate=lambda d: d['nr_claims'] / d['exposure']
    )

    X = valid.drop(columns=['exposure', 'nr_claims', 'claims_rate'])
    y, w = valid['claims_rate'], valid['exposure']

    return X, y, w

In [None]:
def score_model(model, X, y, w):
    """Predict with `model` and compute sample-weighted scoring metrics.

    Parameters
    ----------
    model : object
        Must provide predict(X) and score(X, y, sample_weight=...).
    X : array-like
        Features passed to the model as-is.
    y : array-like
        Observed target values.
    w : array-like
        Sample weights passed to every metric.

    Returns
    -------
    score : dict
        Keys 'EVS', 'mean absolute error', 'mean squared error',
        'Built in D2' and 'MPD', in that order.
    y_predict : array-like
        The output of model.predict(X).
    """
    y_predict = model.predict(X)

    # reference the metric functions directly instead of eval()-ing their names
    weighted_metrics = {
        'EVS': explained_variance_score,
        'mean absolute error': mean_absolute_error,
        'mean squared error': mean_squared_error,
    }

    score = {
        label: metric(y, y_predict, sample_weight=w)
        for label, metric in weighted_metrics.items()
    }

    # the estimator's own score method (labelled D2 by the original author)
    score['Built in D2'] = model.score(X, y, sample_weight=w)

    # MPD: Tweedie deviance with power=1
    score['MPD'] = mean_tweedie_deviance(y, y_predict, sample_weight=w, power=1)

    return score, y_predict

## Program

In [None]:
# the file to import as scoring data
score_data_file = 'test_data.csv'

# NOTE(review): joblib's load unpickles the file contents, so only load artifacts from a trusted source
# presumably the fitted estimator and feature transformer saved by a training notebook - confirm
model = load('model.joblib')
tf = load('transformer.joblib')

# read the scoring data, drop zero-exposure rows and split into features / target / weights
df = load_dataset(score_data_file)
X, y, w = prep_data(df)
# run the raw features through the loaded transformer before scoring
X_tf = tf.transform(X)

In [None]:
# score model: returns a dict of exposure-weighted metrics plus the predictions for X_tf
metrics, y_predict = score_model(model, X_tf, y, w)

In [None]:
# show the metrics dict as the cell output
metrics

## Visualise prediction vs observed

In [None]:
# compile scoring data and predictions into one frame with a fresh 0..n-1 index:
# untransformed features + exposure, observed and predicted rates,
# and the claim counts implied by each rate
X_full = (
    pd.concat([X, w], axis=1)
    .reset_index(drop=True)
    .assign(
        claims_rate=pd.Series(y.reset_index(drop=True)),
        claims_rate_predict=pd.Series(y_predict),
    )
    .assign(
        nr_claims=lambda d: d['claims_rate'] * d['exposure'],
        nr_claims_predict=lambda d: d['claims_rate_predict'] * d['exposure'],
    )
)

In [None]:
# visually compare predictions to the observed scoring data, grouped by parameter_9
X_agg = check_parameter_predict(X_full,'parameter_9')