### Import libraries

In [None]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse

### Read data

In [None]:
# Load the raw records from 'medmal_pos.csv' (path is relative to the
# current working directory) into a DataFrame.
data = pd.read_csv('medmal_pos.csv')
# Show (number of rows, number of columns) as the cell output.
data.shape

### Transform data into development triangles, grouped by insurance company

In [None]:
# Build one "training" development triangle per insurer.
#
# Each triangle has one row per accident year (in order of first appearance)
# and columns lag_1, lag_2, ...  The k-th accident year (0-based) keeps only
# its first (10 - k) IncurLoss_F2 records, which produces the usual
# upper-left run-off triangle; cells that are cut off end up as NaN after the
# concat.
# NOTE(review): assumes each accident year's records are already sorted by
# development lag in the CSV -- confirm against the data source.
train_development_triangles = {}

for insurance_company in data.GRNAME.unique():
    company_rows = data[data.GRNAME == insurance_company].copy()

    triangle_rows = []
    for year_pos, accident_year in enumerate(company_rows.AccidentYear.unique()):
        year_rows = company_rows[company_rows.AccidentYear == accident_year].copy()

        # One-row frame holding the losses still visible for this accident year.
        observed = year_rows[['IncurLoss_F2']].iloc[:10 - year_pos].T
        observed.columns = [f'lag_{k}' for k in range(1, observed.shape[1] + 1)]
        observed.index = [accident_year]
        triangle_rows.append(observed)

    # Rows have different widths; concat aligns them on the lag_* labels.
    train_development_triangles[insurance_company] = pd.concat(triangle_rows)

In [None]:
# Build the complementary "test" triangle per insurer: the cells that were
# cut out of the training triangle.
#
# The first accident year is skipped.  For the i-th accident year (i >= 1)
# the row is left-padded with (10 - i) NaNs and then holds the last i
# IncurLoss_F2 records of that year, so every row is 10 wide (lag_1..lag_10).
test_development_triangles = {}
lag_labels = [f'lag_{k}' for k in range(1, 11)]

for insurance_company in data.GRNAME.unique():
    company_rows = data[data.GRNAME == insurance_company].copy()
    years = company_rows.AccidentYear.unique()

    held_out_rows = []
    for offset, year in enumerate(years[1:], start=1):
        losses = company_rows.loc[company_rows.AccidentYear == year, 'IncurLoss_F2']
        padding = [np.nan] * (10 - offset)
        held_out_rows.append(padding + losses.iloc[-offset:].tolist())

    test_development_triangles[insurance_company] = pd.DataFrame(
        held_out_rows,
        columns=lag_labels,
        index=years[1:],
    )

In [None]:
# Display the training triangle of one insurer as a visual sanity check.
train_development_triangles['Scpie Indemnity Co']

In [None]:
# Display the held-out (test) triangle of the same insurer for comparison.
test_development_triangles['Scpie Indemnity Co']

In [None]:
# Back-test a chain-ladder projection for every insurer.
#
# For each insurer the training triangle is completed with volume-weighted
# age-to-age factors, and the projected cells are compared with the held-out
# test triangle.  One RMSE per accident-year row is appended to `scores`:
# the first key receives the row with a single projected cell, the last key
# the row with nine projected cells.
scores = {
        'one_step_reg' : [],
        'two_step_reg' : [],
        'three_step_reg' : [],
        'four_step_reg' : [],
        'five_step_reg' : [],
        'six_step_reg' : [],
        'seven_step_reg' : [],
        'eight_step_reg' : [],
        'nine_step_reg' : [],
    }

for insurance_company in data.GRNAME.unique():
    train_dt = train_development_triangles[insurance_company]
    # Replace exact zeros with a tiny positive value so that a zero column
    # total cannot cause a division by zero in the factor ratios below.
    train_dt = train_dt.replace(0, 0.001)
    test_dt = test_development_triangles[insurance_company]

    lag_cols = list(train_dt.columns)
    n_lags = len(lag_cols)
    # Row labels are accident years; `last_year - j` below relies on them
    # being consecutive integers.
    last_year = train_dt.index[-1]

    # Age-to-age factor for lag k -> k+1: sum of the known lag k+1 values
    # divided by the sum of lag k over the same rows (the last k rows of
    # lag k have no lag k+1 counterpart yet, hence the positional slice).
    factors = []
    for lag_num in range(1, n_lags):
        next_total = train_dt[f'lag_{lag_num+1}'].sum()
        current_total = train_dt[f'lag_{lag_num}'].iloc[:-lag_num].sum()
        factors.append(next_total / current_total)

    # NOTE(review): `tail_factor` is computed but not used by the scoring
    # below.  The regression is fitted on the raw factors while the
    # extrapolation applies exp(...) + 1, which looks inconsistent -- confirm
    # the intended tail model before relying on this value.
    dev_period = np.arange(1, len(factors) + 1)
    model = LinearRegression().fit(dev_period.reshape(-1, 1), factors)
    tail = np.arange(10, 111)
    tail = np.exp(model.intercept_ + model.coef_ * tail) + 1
    tail_factor = tail.prod()

    # Complete the lower-right part of the triangle column by column.
    # Write with DataFrame.at[row, col]: the chained form df[col].at[row] = ...
    # assigns into a temporary Series and may leave the frame unchanged
    # (always so under pandas Copy-on-Write).
    aux_copy = train_dt.copy()
    for i, col in enumerate(lag_cols[1:]):
        prev_col = lag_cols[i]
        for j in range(i + 1):
            year = last_year - j
            aux_copy.at[year, col] = factors[i] * aux_copy.at[year, prev_col]

    aux_copy = np.round(aux_copy)

    # Same projection, but stored in a frame that contains only the projected
    # cells (everything else stays NaN).  The first accident year is fully
    # observed, so it has no row here; the index is taken from the triangle
    # instead of being hard-coded.
    pred_dt = pd.DataFrame(np.nan, index=train_dt.index[1:], columns=lag_cols)
    for i, col in enumerate(lag_cols[1:]):
        prev_col = lag_cols[i]
        for j in range(i + 1):
            year = last_year - j
            pred_dt.at[year, col] = factors[i] * aux_copy.at[year, prev_col]

    pred_dt = np.round(pred_dt)

    # Row i has its last (i + 1) cells projected; score exactly those cells.
    for i, step_name in enumerate(scores):
        first_projected = n_lags - (i + 1)
        pred_vals = np.array(pred_dt.iloc[i, first_projected:])
        test_vals = np.array(test_dt.iloc[i, first_projected:])
        # RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated and
        # later removed in recent scikit-learn releases.
        scores[step_name].append(np.sqrt(mse(test_vals, pred_vals)))

In [None]:
# Collapse each horizon's list of per-insurer RMSEs into its mean, updating
# the existing `scores` dict in place.
scores.update({
    horizon: np.mean(rmse_values)
    for horizon, rmse_values in scores.items()
})

In [None]:
# One-column table: a row per forecast horizon, with its average RMSE in
# the 'rmse' column.
scores_df = (
    pd.DataFrame([scores])
    .transpose()
    .set_axis(['rmse'], axis=1)
)

In [None]:
# Display the average RMSE per forecast horizon as the cell output.
scores_df