In [None]:
import numpy as np
import pandas as pd

from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_percentage_error as mape

In [None]:
# Load the dataset from a CSV file resolved relative to the working directory.
# Later cells read the GRNAME, AccidentYear and IncurLoss_F2 columns from it.
data = pd.read_csv('medmal_pos.csv')
# Show (n_rows, n_columns) as a quick sanity check of the load.
data.shape

In [None]:
# Distinct GRNAME values, in order of first appearance in `data`.
insurances = data['GRNAME'].unique()

# Group names listed as the test group.
test_insurances = [
    'Dentists Ins Co',
    'Physicians Recip Insurers',
    'Louisiana Med Mut Ins Co',
    'Clinic Mut Ins Co RRG',
    'California Healthcare Ins Co Inc',
]
# Every other group name, keeping the order of `insurances`.
train_insurances = [name for name in insurances if name not in test_insurances]

In [None]:
# Build one table per insurer group: rows are indexed by accident year and the
# columns lag_1..lag_k hold that year's IncurLoss_F2 values in the order the
# rows appear in `data`.
# NOTE(review): lag numbering follows row order, so this assumes the rows of
# each accident year are already sorted by development lag -- confirm.
development_triangles = {}

for insurance_company in insurances:
    company_rows = data[data.GRNAME == insurance_company]

    triangle_rows = []
    for accident_year in company_rows.AccidentYear.unique():
        # Select this year's incurred losses and lay them out as a single row.
        incurred = company_rows.loc[
            company_rows.AccidentYear == accident_year, ['IncurLoss_F2']
        ].T
        incurred.columns = [f'lag_{k}' for k in range(1, incurred.shape[1] + 1)]
        incurred.index = [accident_year]
        triangle_rows.append(incurred)

    # Stack the per-year rows once, after the loop, into the company's table.
    development_triangles[insurance_company] = pd.concat(triangle_rows)

In [None]:
# Column position used to split each triangle row: row i is cut into its first
# N_LAGS - i values (features) and the remaining values (targets).
N_LAGS = 10

# One dataset per row position i = 1..9; the name at index i - 1 belongs to row i.
step_names = [
    'one_step_reg',
    'two_step_reg',
    'three_step_reg',
    'four_step_reg',
    'five_step_reg',
    'six_step_reg',
    'seven_step_reg',
    'eight_step_reg',
    'nine_step_reg',
]
X_y_data = {step_name: {'X': [], 'y': []} for step_name in step_names}

for insurance_company, dt in development_triangles.items():
    # Row 0 is skipped: the loop starts at i = 1.
    for i in range(1, dt.shape[0]):
        # Rows whose feature part is entirely zero are NOT filtered out; an
        # earlier draft skipped them, but that check is disabled.
        X = np.array(dt.iloc[i, :N_LAGS - i])
        y = np.array(dt.iloc[i, N_LAGS - i:])

        step_samples = X_y_data[step_names[i - 1]]
        step_samples['X'].append(X)
        step_samples['y'].append(y)

In [None]:
from sklearn.model_selection import cross_val_score, KFold

# Mean 5-fold cross-validated RMSE per step dataset, filled in X_y_data order.
cv_scores = {}

for step_reg in X_y_data:
    print(step_reg)
    X, y = X_y_data[step_reg]['X'], X_y_data[step_reg]['y']

    # Cross-validate an XGBoost regressor wrapped in MultiOutputRegressor so
    # that it can predict every target column of y.
    multioutputregressor = MultiOutputRegressor(XGBRegressor())
    scores = cross_val_score(
        multioutputregressor,
        X,
        y,
        cv=KFold(n_splits=5),
        scoring='neg_mean_squared_error',
    )
    # The scorer reports negated MSE per fold; flip the sign before the root.
    rmse_scores = np.sqrt(-scores)
    mean_rmse = rmse_scores.mean()
    print('Mean RMSE score:', mean_rmse)
    cv_scores[step_reg] = mean_rmse

In [None]:
# One row per entry of cv_scores, with a single column holding the mean RMSE.
cv_scores_df = pd.Series(cv_scores, name='rmse_kfold_cv').to_frame()

In [None]:
# Display the cross-validation summary table as the cell output.
cv_scores_df