# Implementation

## Packages

In [None]:
# data elaboration functions
import pandas as pd
import string
import numpy as np

# datetime functions
import datetime as dt

# file management functions
import os
import sys
import opendatasets as od
import pickle
from pathlib import Path

# plot functions
import matplotlib.pyplot as plt
%matplotlib inline

# data science functions
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import joblib
from sklearn.metrics import mean_absolute_error

# configuration file
# Put the parent of the working directory on sys.path (once) so that the
# project-local imports below (Configuration, Code) can be resolved.
# NOTE(review): presumably the notebook lives one level below the project root — confirm.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from Configuration.config import cfg_path

# custom functions
from Code.Plotting.plots import Plots
from Code.Regressors.regressors import Regressors
from Code.Regressors.temperatures import Temperatures
from Code.Scoring.scoring import Scoring
from Code.Scoring.train_test import TrainTest
from Code.Scoring.train import Training
from Code.Scoring.forecast import Forecasting
from Code.Scoring.kpi import Kpi
from Code.Scoring.scoring import Scoring
from Code.Utils.utils import Utils


## Setup

In [None]:
# Project root: the directory above the current working directory
root = Path.cwd().parent
# Input data folder; its location relative to the root is read from the configuration
dataset_path = os.path.join(root, cfg_path.data_dir.input_path)

## Load Data


In [None]:
# Read the raw claims file from the input folder defined in the Setup cell
# (dataset_path is root joined with cfg_path.data_dir.input_path)
df_train_data = pd.read_csv(os.path.join(dataset_path, 'insurance-claims.csv'))
# Preview the first rows
df_train_data.head()

# Data Preparation


## Parameter setup

In [None]:
# Column that identifies each series; the following cells plot, resample and aggregate by it.
# NOTE(review): `id` shadows the Python builtin of the same name; it is kept
# because every later cell refers to it.
id = 'ICD10Description'
# Presumably the columns that together identify one observation — not used in the cells shown here.
list_unique_id = ['ICD10Description', 'DateOfAccident']
# Empty and unused in the cells shown here.
# NOTE(review): looks like a leftover placeholder for temperature columns — confirm.
list_temp = []
# Name of the target column (the "y" series)
y = 'Sum of PaidDaysValue'

#### Setting date as datetime

In [None]:
# Parse the accident timestamps with an explicit day-month-(2-digit)year format.
# NOTE(review): the format combines %H (24-hour clock) with %p (AM/PM). In strptime,
# %p only affects the parsed hour when %I is used, so with %H the AM/PM marker is
# read but ignored. Check the raw data to confirm whether %I was intended.
df_train_data['DateOfAccident'] = pd.to_datetime(df_train_data['DateOfAccident'], format = '%d-%m-%y %H:%M:%S %p')

#### Setting forecast end date

In [None]:
# End of the forecast horizon. It is passed as end_date when the full time sequence
# is built and when regressor availability is checked further below.
# Make sure all regressors (temperatures, etc.) are available until forecast_end_date.
forecast_end_date = '2022-12-31'

## Plotting y series

In [None]:
# Show the first 20 distinct ids so that one can be chosen for plotting
print(list(df_train_data[id].unique()[:20]))

In [None]:
# Clean the id names: every run of characters other than ASCII letters and digits
# is collapsed into a single underscore (the ids are later used in file names).
import re

# Compile the pattern once instead of on every row.
_non_alphanumeric = re.compile(r'[^A-Za-z0-9]+')
# Non-string values (e.g. NaN for a missing description) are passed through unchanged;
# calling re.sub on them would raise a TypeError.
df_train_data.loc[:, id] = df_train_data.loc[:, id].apply(
    lambda x: _non_alphanumeric.sub('_', x) if isinstance(x, str) else x
)


In [None]:
# Keep the first 100 distinct ids (fewer if the data holds fewer) for plotting
list_ids_to_plot = list(df_train_data[id].unique()[:100])

In [None]:
# Write one interactive HTML line plot of the target series per selected id.
# The output folder is resolved once, outside the loop, and created if missing
# so that write_html does not fail on a fresh checkout.
plot_dir = os.path.join(root, cfg_path.data_dir.plot_path)
os.makedirs(plot_dir, exist_ok=True)

# enumerate(..., start=1) replaces the manual counter used for the progress message.
for count, i in enumerate(list_ids_to_plot, start=1):
    print('Plotting id:', i, 'as', count, 'of', len(list_ids_to_plot))
    plot = Plots.sliding_line_plot(df_train_data, y, id, i, chart_title="")
    # File name pattern: <id column name>_<id value>.html
    plot.write_html(os.path.join(plot_dir, id + '_' + str(i) + ".html"))

## Dealing with NAs and aggregating at a chosen frequency

Create a full time sequence at a chosen frequency and aggregate the data to that frequency.

#### Claims data (y)

In [None]:
# Restrict the data to the ids selected above (at most 100)
df_train_data = df_train_data[df_train_data[id].isin(list_ids_to_plot)]
# Let the project helper determine which column holds the dates
date_var = Utils.find_date(df_train_data)
print('List ids:', list_ids_to_plot)
# Number of selected ids (shown as the cell output)
len(list_ids_to_plot)

In [None]:
# Resampling aggregates the data with a function chosen per variable:
# e.g. temperatures should be averaged, the target should be summed,
# and dummy/categorical variables should be picked as 'first'.

# Drop any timezone information from the date column. The result must be assigned
# back: tz_localize returns a new object, so calling it without storing the return
# value leaves the dataframe unchanged.
# Work on an independent copy, because df_train_data is a filtered selection at this point.
df_train_data = df_train_data.copy()
# assumes date_var names a datetime64 column (DateOfAccident is converted above) — TODO confirm
df_train_data[date_var] = df_train_data[date_var].dt.tz_localize(None)

# Daily aggregation step
sampling = dt.timedelta(days=1)
# Aggregation per column; the target is referenced through `y` so that it stays
# in sync with the parameter setup cell.
dict_grouping = {'RmaRegionDesc': 'first', 'Product': 'first', y: 'sum'}
df_resampled = Utils.resample_data(df_train_data, id, date_var, sampling, dict_grouping)
print('List ids after resampling:', list(df_resampled[id].unique()))

In [None]:
# Extend every series to a full time sequence at the sampling frequency,
# running up to forecast_end_date (start_date is passed as an empty string)
df_train_data = Utils.add_seq(
    df_resampled,
    date_var,
    serie=id,
    freq=sampling,
    end_date=forecast_end_date,
    start_date='',
)

In [None]:
# This function counts the number of observations each id should have with a full time sequence
Utils.check_length_time_serie(df_train_data, date_var, index = id).head()

In [None]:
# Preview the data after resampling and adding the full time sequence
df_train_data.head()

In [None]:
# List the distinct ids that are still present after resampling and sequence completion
print('List ids after resampling and adding full time sequence:', list(df_train_data[id].unique()))

## Creating working dataset

In [None]:
# Final df: an independent copy, so the regressor columns added below
# do not modify df_train_data
df_final = df_train_data.copy()

# Date: re-detect the date column on the working dataframe
date_var = Utils.find_date(df_final)

#### Count NAs in y by id

In [None]:
# Preview the working dataframe before counting missing values
df_final.head()

In [None]:
# Number of rows with a missing target, per id.
# 'count' aggregations skip NaN, so counting the target column on rows where it is
# NaN always gives 0; the rows are counted with size() instead. The result column is
# named directly, since DataFrame.rename returns a new frame and must not be discarded.
pivotna = (
    df_final.loc[df_final[y].isna()]
    .groupby(id)
    .size()
    .reset_index(name=y + '_count_NA')
)
# Only ids with at least one missing target value appear in the table
pivotna

### Adding regressors to final dataframe

#### Holidays

If you don't have a specific holiday dataset, you can use the following general function by country, which uses the holiday Python package and adds to your dataframe a column with a holiday dummy variable (0/1):

    df_final = Regressors.add_holidays_by_country(df_final, date_var, country = 'France')

In [None]:
# Add the holiday regressor for the United States, then print the date range
# as a check that the covered period is as expected
df_final = Regressors.add_holidays_by_country(df_final, date_var, country='United States')
print('Min date:', df_final[date_var].min())
print('Max date:', df_final[date_var].max())

#### Other calendar variables

In [None]:
# Add weekday and month regressors
# (presumably the wd_* and month_* columns listed in regressors_list below — confirm)
df_final = Regressors.add_weekdays(df_final, date_var)
df_final = Regressors.add_months(df_final, date_var)
# Print the date range as a check that the covered period is as expected
print('Min date:', df_final[date_var].min())
print('Max date:', df_final[date_var].max())

#### Remove duplicates

In [None]:
# Drop fully identical rows
df_final = df_final.drop_duplicates()
print('List ids in df_final after removing duplicates:', list(df_final[id].unique()))
# Sanity check: no row may still be flagged as a duplicate
assert not df_final.duplicated().any(), "y should not contain duplicates"
# Print the date range as a check that the covered period is as expected
print('Min date:', df_final[date_var].min())
print('Max date:', df_final[date_var].max())

#### Check regressor availability

In [None]:
# List the available columns to help build the regressor list below
df_final.columns

In [None]:
# Regressors expected to be available up to forecast_end_date: the holiday dummy,
# two descriptive columns, and the weekday / month variables.
# Each name is listed once ('holidays' was previously duplicated).
regressors_list = [
    'holidays', 'RmaRegionDesc', 'Product',
    'wd_mon', 'wd_tue', 'wd_wed', 'wd_thu', 'wd_fri', 'wd_sat', 'wd_sun',
    'month_01', 'month_02', 'month_03', 'month_04', 'month_05', 'month_06',
    'month_07', 'month_08', 'month_09', 'month_10', 'month_11', 'month_12',
]

try:
    Utils.check_regressors_availability(df_final, date_var, regressors_list, forecast_end_date)
except Exception as exc:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt and
    # SystemExit still propagate, and report why the fallback is taken.
    print('Regressor availability check failed:', exc)
    # NOTE(review): the return value is discarded. If this helper returns a cleaned
    # dataframe or list instead of modifying its arguments in place, capture it — confirm.
    Utils.remove_regressors_with_nan(df_final, date_var, regressors_list, forecast_end_date)

# Saving

In [None]:
# Save the prepared dataframe as a pickle in the configured output folder
df_final.to_pickle(os.path.join(root, cfg_path.data_dir.output_path, 'insurance_claims_final.pkl'))

In [None]:
# Final check of the covered date range and a preview of the saved dataframe
print('Min date:', df_final[date_var].min())
print('Max date:', df_final[date_var].max())
df_final.head()
