## Data Preparation, i.e. Scaling and Splitting (Training vs Test Data)

In [1]:
# Scale feature data to the range [-1,+1]
# option standard: scale each feature w.r.t. its own range
# option conditional: proceed as for option standard, with the exception of the feature 'age of contract';
##  age of contract will be scaled relative to the duration

def data_prep_feautures_scale(data, Max_min, option = 'standard', option_1dim = True):
    """Scale the feature columns of ``data`` to the range [-1, +1].

    Parameters
    ----------
    data : 2-dimensional array, one row per record and one column per feature.
        With ``option_1dim`` set, columns 0..3 are read as age, sum insured,
        duration and age of contract.
    Max_min : 2-dimensional array indexed as ``Max_min[feature, 0]`` (value that
        is mapped to -1, i.e. the minimum) and ``Max_min[feature, 1]`` (value
        that is mapped to +1, i.e. the maximum).
    option : 'standard' scales every feature w.r.t. its own range.
        'conditional' does the same for all but the last column; the last
        column is scaled relative to the second-to-last column
        (``2*last/second_to_last - 1``).
    option_1dim : if truthy, the one-dimensional projections are returned in
        addition to the scaled array.

    Returns
    -------
    ``data_sc`` if ``option_1dim`` is falsy, otherwise the tuple
    ``(data_sc, data_age_sc, data_sum_sc, data_dur_sc, data_aoc_sc)``.

    Raises
    ------
    ValueError
        If ``option`` is neither 'standard' nor 'conditional'.
    """
    if option == 'standard':
        data_sc = 2*(data - Max_min[:,0])/(Max_min[:,1] - Max_min[:,0]) - 1
    elif option == 'conditional':
        data_sc = np.zeros(shape = data.shape)
        data_sc[:,0:-1] = 2*(data[:,0:-1] - Max_min[0:-1,0])/(Max_min[0:-1,1] - Max_min[0:-1,0]) - 1
        data_sc[:,-1] = 2*data[:,-1]/data[:,-2] - 1
    else:
        # Fail loudly: an unrecognised option must not yield an all-zero array.
        raise ValueError("Unknown option '{}'; expected 'standard' or 'conditional'.".format(option))

    if not option_1dim:
        return data_sc

    # Scale w.r.t range (even for option conditional) since a conditional scaling would make the
    # 1-dimensional projection invalid (the factor it relates to, i.e. duration, is not part of
    # the projection).
    data_aoc_sc = 2*(data[:,3] - Max_min[3,0])/(Max_min[3,1] - Max_min[3,0]) - 1

    return data_sc, data_sc[:,0], data_sc[:,1], data_sc[:,2], data_aoc_sc

In [6]:
# Reverse the scaling procedure, to have nicely interpretable values

def data_re_transform_features(data_scaled, Max_min, option = 'standard'):
    """Map scaled feature values from [-1, +1] back to their previous range.

    Parameters
    ----------
    data_scaled : 2-dimensional array of scaled features, one column per feature.
    Max_min : 2-dimensional array indexed as ``Max_min[feature, 0]`` (value that
        -1 is mapped back to, i.e. the minimum) and ``Max_min[feature, 1]``
        (value that +1 is mapped back to, i.e. the maximum).
    option : 'standard' re-transforms every column w.r.t. its own range.
        'conditional' does the same for all but the last column; the last
        column ('age of contract') is re-transformed relative to the already
        re-transformed second-to-last column.

    Returns
    -------
    Array of the same shape as ``data_scaled`` holding the re-transformed values.

    Raises
    ------
    ValueError
        If ``option`` is neither 'standard' nor 'conditional'.
    """
    if option == 'standard':
        return (data_scaled + 1)/2*(Max_min[:,1] - Max_min[:,0]) + Max_min[:,0]

    if option == 'conditional':
        data_previous = np.zeros(shape = data_scaled.shape)
        data_previous[:,0:-1] = (data_scaled[:,0:-1] + 1)/2*(Max_min[0:-1,1] - Max_min[0:-1,0]) + Max_min[0:-1,0]
        # Re-transform 'age of contract' separately; this needs the second-to-last
        # column in its previous scale, so it has to come after the line above.
        data_previous[:,-1] = (data_scaled[:,-1] + 1)/2*data_previous[:,-2]
        return data_previous

    # Report the actual problem instead of failing on an unbound local variable.
    raise ValueError("Unknown option '{}'; expected 'standard' or 'conditional'.".format(option))

In [8]:
# Scale target values
# We check this approach in our analysis, but drop it due to inefficiency and resort to an internal scaling layer instead.
def data_prep_targets_scale(value, scale_up, scale_low =0, logarithmic = False):
    """Scale target values so that ``scale_low`` maps to -1 and ``scale_up`` to +1.

    Parameters
    ----------
    value : scalar or array of target values.
    scale_up : value that is mapped to +1.
    scale_low : value that is mapped to -1 (default 0).
    logarithmic : if truthy, ``value``, ``scale_up`` and ``scale_low`` are first
        transformed by ``log(1 + x)`` and the linear scaling is applied to the
        transformed quantities.

    Returns
    -------
    ``2*(v - low)/(up - low) - 1`` evaluated on the (optionally
    log-transformed) inputs.
    """
    if logarithmic:
        # log1p keeps precision for small magnitudes, where log(1 + x) would
        # round 1 + x to 1 and produce 0/0.
        value = np.log1p(value)
        scale_up = np.log1p(scale_up)
        scale_low = np.log1p(scale_low)
    return 2*(value - scale_low)/(scale_up - scale_low) - 1

In [7]:
# Split data w.r.t. a given share/ ratio.
def data_prep_split(data, split_ratio):
    """Split ``data`` along its first axis into a training and a test part.

    Parameters
    ----------
    data : sequence or array whose first axis enumerates the records.
    split_ratio : share of records that goes into the training part; the
        number of training records is ``int(split_ratio*len(data))``.

    Returns
    -------
    Tuple ``(train, test)``: the first ``N_train`` records and the remaining
    records of ``data``.
    """
    # Derive the number of records from the data passed in, so the split does
    # not depend on a module-level variable being defined and in sync.
    N_train = int(split_ratio*len(data))

    return data[:N_train], data[N_train:]

In [3]:
# Method to scale 3-dimensional data (which is valid as input to RNN)
# dimensions relate to 'record','timestep', 'feature'

def data_prep_rnn_scale(data_rnn, Max_min, default_value = -5, option = 'standard'):
    """Scale 3-dimensional data ('record', 'timestep', 'feature') for use as RNN input.

    Features 0, 1 and 2 (age, sum insured, duration) are scaled w.r.t. their
    range, where ``Max_min[feature, 0]`` is mapped to -1 and
    ``Max_min[feature, 1]`` to +1. Feature 3 (age of contract) is scaled

    * w.r.t. its range for option 'standard',
    * relative to the raw feature 2 (``2*aoc/duration - 1``) for option 'conditional'.

    For any other option a message is printed and feature 3 is left unscaled.

    Every entry whose raw value equals -1 (time point at which the contract has
    matured) is set to ``default_value`` in the result. The input array is not
    modified; a float copy is returned.
    """
    scaled = data_rnn.astype('double')

    # Age, sum insured and duration all use the same range-based scaling
    for feature in range(3):
        low, high = Max_min[feature,0], Max_min[feature,1]
        scaled[:,:,feature] = 2*(scaled[:,:,feature]-low)/(high-low)-1

    # Age of contract: scaling depends on the chosen option
    if option == 'standard':
        low, high = Max_min[3,0], Max_min[3,1]
        scaled[:,:,3] = 2*(scaled[:,:,3]-low)/(high-low)-1
    elif option == 'conditional':
        scaled[:,:,3] = 2*(data_rnn[:,:,3])/(data_rnn[:,:,2])-1
    else:
        print('Option unknown.')

    # The mask is built from the raw data, so values that merely scale to -1 are kept
    matured = (data_rnn[:,:,:]==-1)
    scaled[matured] = default_value

    return scaled