# Implementation

## Packages

In [31]:
# data elaboration functions
import pandas as pd
import string
import numpy as np

# datetime functions
import datetime as dt

# file management functions
import os
import sys
import opendatasets as od
import pickle
from pathlib import Path

# plot functions
import matplotlib.pyplot as plt
%matplotlib inline

# data science functions
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import joblib
from sklearn.metrics import mean_absolute_error

# configuration file
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from Configuration.config import cfg_path

# custom functions
from Code.Plotting.plots import Plots
from Code.Regressors.regressors import Regressors
from Code.Regressors.temperatures import Temperatures
from Code.Scoring.scoring import Scoring
from Code.Scoring.train_test import TrainTest
from Code.Scoring.train import Training
from Code.Scoring.forecast import Forecasting
from Code.Scoring.kpi import Kpi
from Code.Scoring.scoring import Scoring
from Code.Utils.utils import Utils


## Setup

In [32]:
# Project root: the parent of the notebook's current working directory.
root = Path.cwd().parent
# Input folder, resolved from cfg_path.data_dir.input_path relative to the root.
dataset_path = os.path.join(root, cfg_path.data_dir.input_path)

## Load Data


In [33]:
# Read the raw claims file from the input folder and preview the first rows.
claims_csv = os.path.join(dataset_path, 'insurance-claims.csv')
df_train_data = pd.read_csv(claims_csv)
df_train_data.head()

Unnamed: 0,ClaimNumber,ClaimIndicator,AccidentDescription,ClaimsAssessor,ClaimStatus,DateCaptured,DateOfAccident,DateOfDeath,DateReceived,EmployeeNumber,...,Sum of OutstandingPDValue,Sum of OutstandingRecoveriesValue,Sum of PaidClaimsCost,Sum of PaidDays,Sum of PaidDaysValue,Sum of PaidFatalValue,Sum of PaidMedicalValue,Sum of PaidPD,Sum of PaidPDValue,Sum of PaidRecoveriesValue
0,C/1298438/2/001A/10/EMP,Closed / Completed,(blank),Helanie Barnard,Claim has been Acknowledged,06-01-11 2:48:30 PM,28-12-10 9:00:00 AM,,05-01-11,GB082406,...,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0
1,C/1298438/3/001A/10/EMP,Closed / Completed,(blank),Helanie Barnard,Claim has been Acknowledged,06-01-11 2:55:40 PM,28-12-10 9:00:00 AM,,05-01-11,GB080519,...,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0
2,C/1298438/4/001A/10/EMP,Closed / Completed,(blank),Helanie Barnard,Claim has been Acknowledged,06-01-11 3:03:36 PM,28-12-10 9:00:00 AM,,05-01-11,HB015261,...,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0
3,C/1298486/1/099Q/10/EMP,Closed / Completed,(blank),Petro Seaward,Submitted,03-01-11 3:20:04 PM,31-12-10 6:30:00 AM,,03-01-11,(blank),...,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0
4,C/1298490/1/061A/10/EMP,Closed / Completed,(blank),Petro Seaward,Claim has been Acknowledged,04-01-11 8:43:41 AM,11-12-10 11:30:00 AM,,03-01-11,(blank),...,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0


# Data Preparation


## Parameter setup

In [34]:
# NOTE(review): `id` shadows the Python builtin id(); the name is kept because
# later cells reference it. It holds the column that identifies each series.
id = 'ICD10Description'
# Presumably the key that should be unique per row (series id + date) — it is not
# referenced in the visible cells; TODO confirm whether it is still needed.
list_unique_id = ['ICD10Description', 'DateOfAccident']
# Not referenced in the visible cells.
list_temp = []
# Column passed to the plotting helper and to the NA count as the target variable.
y = 'Sum of PaidDaysValue'

#### Setting date as datetime

In [35]:
# Parse the accident timestamp. The raw values use a 12-hour clock with an AM/PM
# marker (e.g. '28-12-10 9:00:00 AM'), so the hour must be parsed with %I: when
# %H is used the %p marker does not affect the hour, and every PM time would be
# read as a morning time.
df_train_data['DateOfAccident'] = pd.to_datetime(
    df_train_data['DateOfAccident'], format='%d-%m-%y %I:%M:%S %p')

#### Setting forecast end date

In [36]:
# End date passed to the time-sequence extension and to the regressor availability check.
# Make sure all regressors (temperatures, etc.) are available up to forecast_end_date.
forecast_end_date = '2022-12-31'

## Plotting y series

In [37]:
# Show the first 20 distinct ids so that one can be chosen for plotting
first_ids = df_train_data[id].unique()[:20]
print(list(first_ids))

['Heat cramp', 'Superficial injury of wrist/hand', 'Open wound head', 'Open wound of parts of foot', 'Open wound head, Multiple open wounds of forearm, Contusion of knee', 'Open wound other parts of wrist/hand', 'Contusion other parts of foot', 'Open wound lower leg', 'Foreign body cornea', 'Fracture of finger', 'Open wound forearm', 'Open wound forearm, Injury of muscle and tendons', 'Open wound of upper arm', 'Open wound of finger(s)/thumb', 'Contusion of thigh', 'Dermatitis', 'Fracture of rib', 'Superficial injury of other parts of head', 'Noise Induced Hearing Loss, Noise Induced Hearing Loss', 'Effects of electric current']


In [38]:
# Normalise id names: each run of characters other than letters and digits
# is replaced by a single underscore
import re

non_alphanumeric = re.compile('[^A-Za-z0-9]+')
df_train_data.loc[:, id] = df_train_data.loc[:, id].map(
    lambda raw_id: non_alphanumeric.sub('_', raw_id))


In [39]:
# Keep the first 100 distinct ids (in order of appearance) for plotting
distinct_ids = df_train_data[id].unique()
list_ids_to_plot = list(distinct_ids[:100])

In [40]:
# Write one HTML chart per selected id into the configured plot folder.
plot_dir = os.path.join(root, cfg_path.data_dir.plot_path)
# Create the folder up front so that write_html does not fail when it is missing.
os.makedirs(plot_dir, exist_ok=True)
n_ids = len(list_ids_to_plot)
for count, i in enumerate(list_ids_to_plot, start=1):
    print('Plotting id:', i, 'as', count, 'of', n_ids)
    plot = Plots.sliding_line_plot(df_train_data, y, id, i, chart_title="")
    # File name pattern: <id column>_<id value>.html
    plot.write_html(os.path.join(plot_dir, id + '_' + str(i) + ".html"))

Plotting id: Heat_cramp as 1 of 100
find_date, date_col found: ['DateOfAccident']
sliding_line_plot: plotting Sum of paiddaysvalue ICD10Description Heat_cramp
Plotting id: Superficial_injury_of_wrist_hand as 2 of 100
find_date, date_col found: ['DateOfAccident']
sliding_line_plot: plotting Sum of paiddaysvalue ICD10Description Superficial_injury_of_wrist_hand
Plotting id: Open_wound_head as 3 of 100
find_date, date_col found: ['DateOfAccident']
sliding_line_plot: plotting Sum of paiddaysvalue ICD10Description Open_wound_head
Plotting id: Open_wound_of_parts_of_foot as 4 of 100
find_date, date_col found: ['DateOfAccident']
sliding_line_plot: plotting Sum of paiddaysvalue ICD10Description Open_wound_of_parts_of_foot
Plotting id: Open_wound_head_Multiple_open_wounds_of_forearm_Contusion_of_knee as 5 of 100
find_date, date_col found: ['DateOfAccident']
sliding_line_plot: plotting Sum of paiddaysvalue ICD10Description Open_wound_head_Multiple_open_wounds_of_forearm_Contusion_of_knee
Plottin

## Dealing with NAs and aggregating at a chosen frequency

Create a full time sequence on a chosen frequency and aggregate

#### Target data (y)

In [41]:
# Restrict the data to the ids selected for plotting
keep_rows = df_train_data[id].isin(list_ids_to_plot)
df_train_data = df_train_data.loc[keep_rows]
# Detect the date column with the project helper
date_var = Utils.find_date(df_train_data)
print('List ids:', list_ids_to_plot)
len(list_ids_to_plot)

find_date, date_col found: ['DateOfAccident']
List ids: ['Heat_cramp', 'Superficial_injury_of_wrist_hand', 'Open_wound_head', 'Open_wound_of_parts_of_foot', 'Open_wound_head_Multiple_open_wounds_of_forearm_Contusion_of_knee', 'Open_wound_other_parts_of_wrist_hand', 'Contusion_other_parts_of_foot', 'Open_wound_lower_leg', 'Foreign_body_cornea', 'Fracture_of_finger', 'Open_wound_forearm', 'Open_wound_forearm_Injury_of_muscle_and_tendons', 'Open_wound_of_upper_arm', 'Open_wound_of_finger_s_thumb', 'Contusion_of_thigh', 'Dermatitis', 'Fracture_of_rib', 'Superficial_injury_of_other_parts_of_head', 'Noise_Induced_Hearing_Loss_Noise_Induced_Hearing_Loss', 'Effects_of_electric_current', 'Superficial_injuries_of_abdomen_lower_back_pelvis', 'Contusion_other_parts_of_wrist_hand', 'Contusion_other_parts_of_foot_Fracture_of_toe', 'Contusion_of_finger_s_thumb', 'Superficial_injury_of_chest_wall', 'Noise_Induced_Hearing_Loss', 'Open_wound_forearm_Open_wound_of_finger_s_thumb', 'Open_wound_of_shoulder

100

In [42]:
# The resampling helper aggregates each column with its own function, chosen per variable:
# e.g. temperatures should be averaged, amounts should be summed, and dummy or
# categorical variables should be picked as 'first'.

# Drop any timezone information from the date column. The result must be assigned
# back: the earlier version computed the tz-naive values and then discarded them,
# which made the statement a no-op. `assign` builds a new frame, so the filtered
# slice held in df_train_data is not modified in place.
df_train_data = df_train_data.assign(
    **{date_var: df_train_data[date_var].dt.tz_localize(None)})
sampling = dt.timedelta(days=1)
dict_grouping = {'RmaRegionDesc': 'first', 'Product': 'first', 'Sum of PaidDaysValue': 'sum'}
df_resampled = Utils.resample_data(df_train_data, id, date_var, sampling, dict_grouping)
print('List ids after resampling:', list(df_resampled[id].unique()))

resample_data: variable RmaRegionDesc
resample_data: variable RmaRegionDesc completed
resample_data: variable Product
resample_data: variable Product completed
resample_data: variable Sum of PaidDaysValue
resample_data: variable Sum of PaidDaysValue completed
       DateOfAccident  RmaRegionDesc  \
0          1982-10-24        Witbank   
1          1982-10-25           None   
2          1982-10-26           None   
3          1982-10-27           None   
4          1982-10-28           None   
...               ...            ...   
119735     2011-06-08           None   
119736     2011-06-09           None   
119737     2011-06-10           None   
119738     2011-06-11      Kimberley   
119739     2011-02-01  Carletonville   

                                         ICD10Description  \
0                                    Amputation_of_finger   
1                                    Amputation_of_finger   
2                                    Amputation_of_finger   
3              

In [43]:
# Add a full time sequence per id at the sampling frequency, ending at forecast_end_date
df_train_data = Utils.add_seq(
    df_resampled,
    date_var,
    serie=id,
    freq=sampling,
    end_date=forecast_end_date,
    start_date='',
)

Adding sequence to serie Amputation_of_finger as 1 of 100
Adding sequence to serie Fracture_of_finger as 2 of 100
Adding sequence to serie Fatal_due_to_Accident_Any_ as 3 of 100
Adding sequence to serie Noise_Induced_Hearing_Loss as 4 of 100
Adding sequence to serie Open_wound_head as 5 of 100
Adding sequence to serie Contusion_of_finger_s_thumb as 6 of 100
Adding sequence to serie Foreign_body_cornea as 7 of 100
Adding sequence to serie Sprain_and_strain_of_cervical_spine as 8 of 100
Adding sequence to serie Other_respiratory_conditions as 9 of 100
Adding sequence to serie Contusion_of_knee as 10 of 100
Adding sequence to serie Contusion_of_shoulder_upper_arm as 11 of 100
Adding sequence to serie Sprain_collateral_ligament_of_knee as 12 of 100
Adding sequence to serie Open_wound_other_parts_of_wrist_hand as 13 of 100
Adding sequence to serie Superficial_injuries_of_abdomen_lower_back_pelvis as 14 of 100
Adding sequence to serie Sprain_and_strain_of_ankle as 15 of 100
Adding sequence t

In [44]:
# Compare, per id, the observed number of rows with the number a full time sequence would have
length_check = Utils.check_length_time_serie(df_train_data, date_var, index=id)
length_check.head()

Expected length of sequence is OK 
                                      ICD10Description    count  expected_obs
0                   Amputation_between_knee_and_ankle  14679.0       14679.0
1                                Amputation_of_finger  14679.0       14679.0
2                               Amputation_of_one_toe  14679.0       14679.0
3                   Amputation_of_two_or_more_fingers  14679.0       14679.0
4                                   Barotrauma_Otitic  14679.0       14679.0
..                                                ...      ...           ...
95  Superficial_injury_of_lip_oral_cavity_Open_wou...  14679.0       14679.0
96  Superficial_injury_of_lower_leg_Open_wound_hea...  14679.0       14679.0
97          Superficial_injury_of_other_parts_of_head  14679.0       14679.0
98  Superficial_injury_of_other_parts_of_head_Open...  14679.0       14679.0
99                   Superficial_injury_of_wrist_hand  14679.0       14679.0

[100 rows x 3 columns]


Unnamed: 0,ICD10Description,count,min,max,td,freq,expected_obs,mismatch
0,Amputation_between_knee_and_ankle,14679.0,1982-10-24,2022-12-31,14678 days,D,14679.0,0
1,Amputation_of_finger,14679.0,1982-10-24,2022-12-31,14678 days,D,14679.0,0
2,Amputation_of_one_toe,14679.0,1982-10-24,2022-12-31,14678 days,D,14679.0,0
3,Amputation_of_two_or_more_fingers,14679.0,1982-10-24,2022-12-31,14678 days,D,14679.0,0
4,Barotrauma_Otitic,14679.0,1982-10-24,2022-12-31,14678 days,D,14679.0,0


In [45]:
# Preview the data after resampling and adding the full time sequence
df_train_data.head()

Unnamed: 0,ICD10Description,DateOfAccident,RmaRegionDesc,Product,Sum of PaidDaysValue
0,Amputation_of_finger,1982-10-24,Witbank,IOD Workmans Policy,0.0
1,Amputation_of_finger,1982-10-25,,,0.0
2,Amputation_of_finger,1982-10-26,,,0.0
3,Amputation_of_finger,1982-10-27,,,0.0
4,Amputation_of_finger,1982-10-28,,,0.0


In [46]:
# List the distinct ids that are present after resampling and sequence extension
ids_after_sequence = df_train_data[id].unique()
print('List ids after resampling and adding full time sequence:', list(ids_after_sequence))

List ids after resampling and adding full time sequence: ['Amputation_of_finger', 'Fracture_of_finger', 'Fatal_due_to_Accident_Any_', 'Noise_Induced_Hearing_Loss', 'Open_wound_head', 'Contusion_of_finger_s_thumb', 'Foreign_body_cornea', 'Sprain_and_strain_of_cervical_spine', 'Other_respiratory_conditions', 'Contusion_of_knee', 'Contusion_of_shoulder_upper_arm', 'Sprain_collateral_ligament_of_knee', 'Open_wound_other_parts_of_wrist_hand', 'Superficial_injuries_of_abdomen_lower_back_pelvis', 'Sprain_and_strain_of_ankle', 'Open_wound_other_parts_lower_leg', 'Superficial_injury_of_other_parts_of_head', 'Contusion_of_ankle', 'Open_wound_of_finger_s_thumb', 'Sprain_and_strain_of_lumbar_spine', 'Barotrauma_Otitic', 'Contusion_of_elbow', 'Contusion_other_parts_of_wrist_hand', 'Fracture_of_thumb', 'Open_wound_of_knee', 'Open_wound_of_back_wall_of_thorax', 'Superficial_injury_of_wrist_hand', 'Fracture_of_shaft_of_tibia', 'Contusion_of_lower_back_and_pelvis', 'Contusion_other_parts_of_foot', 'Sup

## Creating working dataset

In [47]:
# Working dataset: a deep copy, so later changes do not touch df_train_data
df_final = df_train_data.copy(deep=True)

# Detect the date column again on the working copy
date_var = Utils.find_date(df_final)

find_date, date_col found: ['DateOfAccident']


#### Count NAs in y by id

In [48]:
# Preview the working dataset before counting missing target values
df_final.head()

Unnamed: 0,ICD10Description,DateOfAccident,RmaRegionDesc,Product,Sum of PaidDaysValue
0,Amputation_of_finger,1982-10-24,Witbank,IOD Workmans Policy,0.0
1,Amputation_of_finger,1982-10-25,,,0.0
2,Amputation_of_finger,1982-10-26,,,0.0
3,Amputation_of_finger,1982-10-27,,,0.0
4,Amputation_of_finger,1982-10-28,,,0.0


In [49]:
# Count the missing target values per id.
# The earlier version first kept only the rows where y is NaN and then aggregated
# with 'count', which ignores NaN and therefore always reported 0; the result of
# its rename call was also discarded, so the column kept the name of y.
pivotna = (
    df_final[y]
    .isna()                      # True where the target is missing
    .groupby(df_final[id])       # one group per id
    .sum()                       # number of missing values in each group
    .rename(y + '_count_NA')
    .reset_index()
)
pivotna

Unnamed: 0,ICD10Description,Sum of PaidDaysValue
0,Amputation_between_knee_and_ankle,0
1,Amputation_of_finger,0
2,Amputation_of_one_toe,0
3,Amputation_of_two_or_more_fingers,0
4,Barotrauma_Otitic,0
...,...,...
95,Superficial_injury_of_lip_oral_cavity_Open_wou...,0
96,Superficial_injury_of_lower_leg_Open_wound_hea...,0
97,Superficial_injury_of_other_parts_of_head,0
98,Superficial_injury_of_other_parts_of_head_Open...,0


### Adding regressors to final dataframe

#### Holidays

If you don't have a specific holiday dataset, you can use the following general function, which relies on the Python `holidays` package to add a holiday dummy column (0/1) for the chosen country to your dataframe:

    df_final = Regressors.add_holidays_by_country(df_final, date_var, country = 'France')

In [50]:
# Add a holiday dummy column for the given country and print the covered date range.
# NOTE(review): the recorded output of this cell is a SyntaxError, so it did not
# complete in the saved run. Possibly the helper cannot handle a country name that
# contains a space — confirm against Regressors.add_holidays_by_country.
# NOTE(review): the region values shown earlier (Witbank, Kimberley, Carletonville)
# suggest South African data — confirm that US holidays are the intended calendar.
df_final = Regressors.add_holidays_by_country(df_final, date_var, country='United States')
print('Min date:', df_final[date_var].min())
print('Max date:', df_final[date_var].max())

SyntaxError: invalid syntax (<string>, line 1)

#### Other calendar variables

In [None]:
# Add weekday and month variables, then report the covered date range
df_with_weekdays = Regressors.add_weekdays(df_final, date_var)
df_final = Regressors.add_months(df_with_weekdays, date_var)
date_column = df_final[date_var]
print('Min date:', date_column.min())
print('Max date:', date_column.max())

Min date: 1982-10-24 00:00:00
Max date: 2022-12-31 00:00:00


#### Remove duplicates

In [None]:
# Drop fully identical rows, then verify that none are left
df_final = df_final.drop_duplicates()
print('List ids in df_final after removing duplicates:', list(df_final[id].unique()))
remaining_duplicates = df_final[df_final.duplicated()]
assert remaining_duplicates.count().sum() == 0, "y should not contain duplicates"
date_column = df_final[date_var]
print('Min date:', date_column.min())
print('Max date:', date_column.max())

List ids in df_final after removing duplicates: ['Amputation_of_finger', 'Fracture_of_finger', 'Fatal_due_to_Accident_Any_', 'Noise_Induced_Hearing_Loss', 'Open_wound_head', 'Contusion_of_finger_s_thumb', 'Foreign_body_cornea', 'Sprain_and_strain_of_cervical_spine', 'Other_respiratory_conditions', 'Contusion_of_knee', 'Contusion_of_shoulder_upper_arm', 'Sprain_collateral_ligament_of_knee', 'Open_wound_other_parts_of_wrist_hand', 'Superficial_injuries_of_abdomen_lower_back_pelvis', 'Sprain_and_strain_of_ankle', 'Open_wound_other_parts_lower_leg', 'Superficial_injury_of_other_parts_of_head', 'Contusion_of_ankle', 'Open_wound_of_finger_s_thumb', 'Sprain_and_strain_of_lumbar_spine', 'Barotrauma_Otitic', 'Contusion_of_elbow', 'Contusion_other_parts_of_wrist_hand', 'Fracture_of_thumb', 'Open_wound_of_knee', 'Open_wound_of_back_wall_of_thorax', 'Superficial_injury_of_wrist_hand', 'Fracture_of_shaft_of_tibia', 'Contusion_of_lower_back_and_pelvis', 'Contusion_other_parts_of_foot', 'Superficial_

#### Check regressor availability

In [None]:
# List the available columns; the regressor list in the next cell is built from these names
df_final.columns

Index(['ICD10Description', 'DateOfAccident', 'RmaRegionDesc', 'Product',
       'Sum of PaidDaysValue', 'holidays', 'wd_mon', 'wd_tue', 'wd_wed',
       'wd_thu', 'wd_fri', 'wd_sat', 'wd_sun', 'month_01', 'month_02',
       'month_03', 'month_04', 'month_05', 'month_06', 'month_07', 'month_08',
       'month_09', 'month_10', 'month_11', 'month_12'],
      dtype='object')

In [None]:
# Regressors that are checked for availability up to forecast_end_date.
# Each name appears once: 'holidays' was previously listed twice, which made the
# helper process the same column two times.
regressors_list = ['holidays', 'RmaRegionDesc', 'Product',
                   'wd_mon', 'wd_tue', 'wd_wed',
                   'wd_thu', 'wd_fri', 'wd_sat', 'wd_sun', 'month_01', 'month_02',
                   'month_03', 'month_04', 'month_05', 'month_06', 'month_07', 'month_08',
                   'month_09', 'month_10', 'month_11', 'month_12']

try:
    Utils.check_regressors_availability(df_final, date_var, regressors_list, forecast_end_date)
except Exception:
    # Catch Exception rather than using a bare `except:` so that KeyboardInterrupt
    # and SystemExit are no longer swallowed.
    # NOTE(review): the return value of remove_regressors_with_nan is discarded —
    # confirm that the helper updates its arguments in place.
    Utils.remove_regressors_with_nan(df_final, date_var, regressors_list, forecast_end_date)

Regressor holidays has all needed values
Latest filled available date for regressor RmaRegionDesc is 2013-03-12 00:00:00 
 expected is 2022-12-31 00:00:00
Regressor holidays has all needed values
Latest filled available date for regressor RmaRegionDesc is 2013-03-12 00:00:00 
 expected is 2022-12-31 00:00:00
Regressor RmaRegionDesc shows null values <= forecast_end_date. 
 Regressor REMOVED
Latest filled available date for regressor Product is 2013-03-12 00:00:00 
 expected is 2022-12-31 00:00:00
Regressor Product shows null values <= forecast_end_date. 
 Regressor REMOVED
Latest filled available date for regressor Sum of PaidDaysValue is 2022-12-31 00:00:00 
 expected is 2022-12-31 00:00:00
Regressor Sum of PaidDaysValue shows null values <= forecast_end_date. 
 Regressor REMOVED
Regressor holidays has all needed values
Regressor wd_mon has all needed values
Regressor wd_tue has all needed values
Regressor wd_wed has all needed values
Regressor wd_thu has all needed values
Regressor w

# Saving

In [52]:
# Save the working dataset as a pickle file in the configured output folder
output_file = os.path.join(root, cfg_path.data_dir.output_path, 'insurance_claims_final.pkl')
df_final.to_pickle(output_file)

In [53]:
# Report the date range of the saved dataset and preview its first rows
date_column = df_final[date_var]
print('Min date:', date_column.min())
print('Max date:', date_column.max())
df_final.head()


Min date: 1982-10-24 00:00:00
Max date: 2022-12-31 00:00:00


Unnamed: 0,ICD10Description,DateOfAccident,RmaRegionDesc,Product,Sum of PaidDaysValue
0,Amputation_of_finger,1982-10-24,Witbank,IOD Workmans Policy,0.0
1,Amputation_of_finger,1982-10-25,,,0.0
2,Amputation_of_finger,1982-10-26,,,0.0
3,Amputation_of_finger,1982-10-27,,,0.0
4,Amputation_of_finger,1982-10-28,,,0.0
