In [1]:
import numpy as np
import pandas as pd

from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_percentage_error as mape

In [2]:
# Read the source CSV into `data`, then show its (rows, columns) shape.
DATA_PATH = 'medmal_pos.csv'
data = pd.read_csv(DATA_PATH)
data.shape

(3400, 13)

In [29]:
# Distinct GRNAME values, in the order they first appear in `data`.
insurances = data['GRNAME'].unique()

# Insurers listed as the test group; every other insurer goes to the
# training group below.
test_insurances = [
    'Dentists Ins Co',
    'Physicians Recip Insurers',
    'Louisiana Med Mut Ins Co',
    'Clinic Mut Ins Co RRG',
    'California Healthcare Ins Co Inc',
]
train_insurances = [name for name in insurances if name not in test_insurances]

In [52]:
# Build one development table per insurer, keyed by GRNAME.
#
# Each table has one row per accident year (the row label is the accident
# year) and one column per reported value, named lag_1 .. lag_n, filled with
# that accident year's IncurLoss_F2 values in the order they appear in `data`.
# NOTE(review): this relies on the rows of each accident year already being
# ordered by development lag in the file -- confirm, or sort explicitly.
#
# groupby(sort=False) keeps insurers and accident years in first-appearance
# order, and visits the frame once instead of re-filtering it with a boolean
# mask for every insurer and every accident year.
development_triangles = {}

for insurance_company, company_df in data.groupby('GRNAME', sort=False):
    accident_year_rows = []
    for accident_year, year_df in company_df.groupby('AccidentYear', sort=False):
        incurred = year_df['IncurLoss_F2'].to_numpy()
        accident_year_rows.append(
            pd.DataFrame(
                incurred.reshape(1, -1),  # a single row, one column per lag
                index=[accident_year],
                columns=[f'lag_{k}' for k in range(1, len(incurred) + 1)],
            )
        )

    development_triangles[insurance_company] = pd.concat(accident_year_rows)

In [53]:
# Training samples, bucketed by how many trailing values are predicted:
# 'one_step_reg' holds rows with one target value, 'two_step_reg' rows with
# two targets, and so on up to 'nine_step_reg'.
step_names = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
X_y_data = {f'{step}_step_reg': {'X': [], 'y': []} for step in step_names}

# Bucket keys in insertion order, computed once instead of on every append.
step_keys = list(X_y_data)

for insurance_company, dt in development_triangles.items():
    print(insurance_company)

    # Derive the split from the table's own width rather than a hard-coded 10.
    n_lags = dt.shape[1]

    # Row 0 is skipped: it would contribute no target values.
    for i in range(1, dt.shape[0]):
        # Row i keeps its first (n_lags - i) values as features and its last
        # i values as targets, so it belongs to the (i - 1)-th bucket.
        # NOTE(review): an earlier draft skipped rows whose feature part was
        # all zeros; such rows are currently kept -- confirm that is intended.
        split = n_lags - i
        bucket = X_y_data[step_keys[i - 1]]

        bucket['X'].append(np.array(dt.iloc[i, :split]))
        bucket['y'].append(np.array(dt.iloc[i, split:]))

Scpie Indemnity Co
Promutual Grp
Great Amer Grp
Nationwide Grp
Markel Corp Grp
Overseas Partners Us Reins Co
Eastern Dentists Ins Co RRG
American Assoc Of Othodontists RRG
Controlled Risk Ins Co Of VT Inc
Texas Medical Ins Co
MCIC VT Inc RRG
Franklin Cas Ins Co RRG
Homestead Ins Co
Campmed Cas & Ind Co Inc MD
Community Blood Cntr Exch RRG
Underwriters At Lloyds London
NCMIC Ins Co
National American Ins Co
Michigan Professional Ins Exch
Nichido Fire & Marine Ins Co Ltd
Texas Hospital Ins Exch
State Volunteer Mut Ins Co
MHA Ins Co
Health Care Ind Inc
National Guardian RRG Inc
Preferred Professional Ins Co
Medical Mut Ins Co Of ME
Utah Medical Ins Assoc
Seguros Triples Inc
Dentists Ins Co
Physicians Recip Insurers
Louisiana Med Mut Ins Co
Clinic Mut Ins Co RRG
California Healthcare Ins Co Inc


In [54]:
from sklearn.model_selection import cross_val_score, KFold

# Mean 5-fold cross-validated RMSE per step bucket, keyed like X_y_data.
# Filled in as each bucket is evaluated, so it never holds placeholder values.
cv_scores = {}

for step_reg, samples in X_y_data.items():
    print(step_reg)
    X, y = samples['X'], samples['y']

    # Cross-validate a default XGBoost regressor wrapped in
    # MultiOutputRegressor, which fits one regressor per target column.
    multioutputregressor = MultiOutputRegressor(XGBRegressor())
    scores = cross_val_score(
        multioutputregressor,
        X,
        y,
        cv=KFold(n_splits=5),
        scoring='neg_mean_squared_error',
    )

    # The scorer reports negated MSE per fold; flip the sign and take the
    # square root to get a per-fold RMSE, then average across folds.
    rmse_scores = np.sqrt(-scores)
    mean_rmse = rmse_scores.mean()
    print('Mean RMSE score:', mean_rmse)
    cv_scores[step_reg] = mean_rmse

one_step_reg
Mean RMSE score: 2635.7355119311005
two_step_reg
Mean RMSE score: 7703.999456666305
three_step_reg
Mean RMSE score: 6278.017105443386
four_step_reg
Mean RMSE score: 7602.737099609297
five_step_reg
Mean RMSE score: 6682.941833426943
six_step_reg
Mean RMSE score: 7191.441733691856
seven_step_reg
Mean RMSE score: 9004.744439428965
eight_step_reg
Mean RMSE score: 7269.959001268027
nine_step_reg
Mean RMSE score: 8459.531422364438


In [58]:
# One row per step bucket (row label = bucket name) and a single column
# holding that bucket's mean cross-validated RMSE.
cv_scores_df = pd.Series(cv_scores, name='rmse_kfold_cv').to_frame()

In [59]:
# Display the per-bucket cross-validation RMSE table.
cv_scores_df

Unnamed: 0,rmse_kfold_cv
one_step_reg,2635.735512
two_step_reg,7703.999457
three_step_reg,6278.017105
four_step_reg,7602.7371
five_step_reg,6682.941833
six_step_reg,7191.441734
seven_step_reg,9004.744439
eight_step_reg,7269.959001
nine_step_reg,8459.531422
