# PhyloCNN - CI_HIV

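This notebook is adapted from [Lambert et al. (2023)](https://github.com/JakubVoz/deeptimelearning/blob/main/estimation/NN/empirical/BISSE_cnn_CDV_mae_CI_computation_Gomez2012.ipynb). It applies a trained PhyloCNN model to an encoded empirical HIV tree and computes approximate 95% confidence intervals (2.5th–97.5th percentiles) for the predicted parameters from the closest simulated trees.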

In [None]:
## Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from keras.models import model_from_json

In [None]:
######### Load the encoded empirical data #########
# The file is tab-separated, with a header row and an index column; the flat
# table is reshaped into a 3-D array of shape (-1, 1000, 18).
encoding_Zurich = (
    pd.read_csv('./Encoded_HIV.csv', sep="\t", header=0, index_col=0)
    .values
    .reshape(-1, 1000, 18)
)

In [None]:
# Additional sampling probability and tree size for rescaling
test_rescale = 5.174108653956917  # Rescaling factor
test_tree_size = 200  # Example tree size

# Sampling probability of the empirical data (0.25 for every row of every tree).
# Kept as a 0-d array: it is reused later as the reference value when selecting
# the closest simulations.
samp_proba_list = np.array(0.25)

# Append the sampling probability as one extra feature column. The new column is
# sized from the encoding itself, so this works for any number of trees and rows
# rather than only for a single tree of 1000 rows.
# NOTE: the result is assigned back to `encoding_Zurich`, so re-running this cell
# without reloading the encoding appends a further column.
encoding_Zurich = np.concatenate(
    (encoding_Zurich, np.full(encoding_Zurich.shape[:2] + (1,), samp_proba_list)),
    axis=2,
)

(1, 999, 19)

In [None]:
# This function takes in the tree encodings for the empirical dataset
# and processes them to have a uniform shape: leaves and internal nodes are
# separated, sorted, and each zero-padded to a fixed number of rows (500 by default).

def encode_pad_0s_rootage(enc, max_size=500):
    """Split every tree encoding into a leaf channel and a node channel of fixed size.

    For each tree, rows with column 3 equal to 1 are treated as leaves and rows
    with column 3 greater than 1 as internal nodes (other rows, e.g. all-zero
    padding, are dropped). Each group is sorted in increasing order of column 1
    (assumed to hold the age), the last node row is duplicated once, and both
    groups are zero-padded at the end to ``max_size`` rows.

    :param enc: np.ndarray of shape (n_trees, n_rows, n_features)
    :param max_size: int, number of rows of each channel after padding (default 500)
    :return: np.ndarray of shape (n_trees, max_size, n_features, 2); the last axis
        holds the leaves (0) and the nodes (1)
    :raises ValueError: if a tree has more than ``max_size`` leaves, or more than
        ``max_size`` node rows once the last node has been duplicated
    :raises IndexError: if a tree has no internal node (nothing to duplicate)
    """
    # Create an empty list to hold the padded encodings
    enc_pad = []

    # Iterate over each tree in the dataset
    for i in range(enc.shape[0]):
        tree = enc[i]

        # Separate the leaves (where column 3 has value 1, which indicates leaves)
        leaves = tree[tree[:, 3] == 1]
        # Sort leaves by their age (assumed to be in column 1)
        leaves = leaves[np.argsort(leaves[:, 1])]

        # Separate the nodes (where column 3 is greater than 1, indicating internal nodes)
        nodes = tree[tree[:, 3] > 1]
        # Sort nodes by their age (assumed to be in column 1)
        nodes = nodes[np.argsort(nodes[:, 1])]
        # Copy the last node's row to balance the number of leaves and nodes
        nodes = np.append(nodes, nodes[-1].reshape(1, -1), axis=0)

        # Fail with an explicit message instead of np.pad's negative-width error
        if leaves.shape[0] > max_size or nodes.shape[0] > max_size:
            raise ValueError(
                "tree %d has %d leaf rows and %d node rows, which exceeds max_size=%d"
                % (i, leaves.shape[0], nodes.shape[0], max_size)
            )

        # Pad both arrays with 0s at the end so that each has max_size rows
        leaves = np.pad(leaves, [(0, max_size - leaves.shape[0]), (0, 0)], mode='constant')
        nodes = np.pad(nodes, [(0, max_size - nodes.shape[0]), (0, 0)], mode='constant')

        # Stack the leaves and nodes arrays together along axis 2 (creating 2 channels)
        enc_pad.append(np.stack((leaves, nodes), axis=2))

    # Convert the list to a numpy array and return the padded encodings
    return np.array(enc_pad)

# Re-encode the empirical data: leaves and internal nodes are split into two
# channels, each sorted by column 1 and padded with 0s to a fixed size
encoding_pad = encode_pad_0s_rootage(encoding_Zurich)


In [None]:
# Load the model architecture from its JSON description.
# `model_from_json` is imported in the import cell at the top of the notebook.
# The context manager closes the file even if reading fails.
with open('./Trained_Models/Trained_2Generation_BDSS_HIV.json', 'r') as json_file:
    model = json_file.read()
estimator = model_from_json(model)
# load weights
estimator.load_weights('./Trained_Models/Trained_2Generation_BDSS_HIV.h5')
print('model loaded!')

# predict values for the empirical data
predicted_test = pd.DataFrame(estimator.predict(encoding_pad))
# Apply rescaling to the second predicted column (infectious period)
predicted_test[1] = predicted_test[1] * test_rescale

model loaded!


In [None]:
# Show the predicted values for the empirical data (column 1 has already been
# multiplied by test_rescale)
print(predicted_test)

          0         1          2         3
0  1.412436  11.32054  20.390572  0.129327


In [None]:
### Load the data sets used for the CI computations
# CI_param:     parameter values used to simulate trees in the training set
# CI_predicted: predicted parameter values obtained with the training set
CI_param = pd.read_csv(
    './BDSS_HIV.csv',
    sep=",",
)
CI_predicted = pd.read_csv(
    './Predictions_BDSS_HIV.csv',
)

In [None]:
# Target parameters for which a confidence interval (CI) will be calculated
targets = [
    "R_nought",
    "infectious_period_rescaled",
    "x_transmission",
    "fraction_1",
]

# Number of neighboring simulation sets considered to compute the CI
n_neighbors = [1000]

# Min/max values applied to the computed CI values: every target is bounded to
# [0, 1000] (non-negative values, capped at 1,000). Each target gets its own list.
min_max = {target: [0, 1000] for target in targets}

# Column names of the output table: <target><statistic>_<number of neighbors>
add_ons_names = ['_CI_2_5', '_CI_97_5', '_CI_width']
col = [f"{add_on}_{n_neigh}" for n_neigh in n_neighbors for add_on in add_ons_names]
col_comp = [target + co for target in targets for co in col]

In [None]:
# Nearest neighbors of a single scalar value within a one-dimensional set of values
def get_indexes_of_closest_single_factor(test_value, ci_values, n):
    """Return the index labels of the n entries of ci_values closest to test_value.

    Closeness is the absolute difference between each entry and test_value.

    :param test_value: float, value of the parameter (e.g. sampling proba or tree size)
        of the given observation
    :param ci_values: pandas Series, values of this parameter in the CI set
    :param n: int, number of nearest neighbors to return; must not exceed len(ci_values)
    :return: list, index labels of the n nearest neighbors, closest first
    """
    distances = (ci_values - test_value).abs()
    ranked_labels = ci_values.iloc[distances.argsort()].index
    return [ranked_labels[rank] for rank in range(n)]


def get_indexes_of_closest(test_s, ci_s, n):
    """Return the index labels of the n rows of ci_s closest to test_s.

    Closeness is the Euclidean distance computed over all columns of ci_s.

    :param test_s: pandas object exposing ``.values``, parameter set of the given
        observation (broadcast against the rows of ci_s)
    :param ci_s: dataframe, parameter sets of the CI set
    :param n: int, number of nearest neighbors to return; must not exceed len(ci_s)
    :return: list, index labels of the n nearest neighbors, closest first
    """
    offsets = ci_s - test_s.values
    distances = offsets.pow(2).sum(axis=1).pow(0.5)
    ranked_labels = ci_s.iloc[distances.argsort()].index
    return [ranked_labels[rank] for rank in range(n)]


def get_predicted_closest_single(indexes, pred_value_table, targ):
    """Return the predicted values of one parameter for the given neighbors.

    :param indexes: list, index labels of the nearest neighbors
    :param pred_value_table: dataframe, predicted parameter values of CI set
    :param targ: str, parameter name
    :return: list, predicted value of targ for each neighbor, in the order of indexes
    """
    # select the neighbors' rows and the requested parameter in one step
    return list(pred_value_table.loc[indexes, targ])


def get_error_closest_single(indexes, real_value_table, pred_value_table, targ):
    """Return the signed prediction errors of one parameter for the given neighbors.

    The error is ``predicted - real``; no absolute value is taken.

    :param indexes: list, index labels of the nearest neighbors
    :param real_value_table: dataframe, real/target parameter values of CI set
    :param pred_value_table: dataframe, predicted parameter values of CI set
    :param targ: str, parameter name
    :return: pandas Series of signed errors, indexed by the labels in indexes
    """
    predicted = pred_value_table.loc[indexes, targ]
    actual = real_value_table.loc[indexes, targ]
    return predicted - actual


def apply_filter(df1, df2, df3, df4, indexes):
    """Select the rows labelled by indexes from each of the four tables.

    :param df1, df2, df3, df4: pandas objects sharing the same index labels
    :param indexes: list, index labels of the closest neighbors
    :return: tuple of the four filtered objects, in the order they were given
    """
    return tuple(table.loc[indexes] for table in (df1, df2, df3, df4))


def load_files(arg_name, sep=""):
    """Loads given file

    The file is parsed without a header row and its first column becomes the index.

    :param arg_name: parser arg, pointer to the file
    :param sep: str, eventual separator; an empty string keeps pandas' default
    :return: pd.Dataframe, loaded file
    """
    read_kwargs = {"index_col": 0, "header": None}
    if sep != "":
        read_kwargs["sep"] = sep

    # Parse straight from the open handle: no intermediate string buffer (and no
    # dependency on the `io` module), and the file is closed by the context manager.
    with open(arg_name, 'r') as des0:
        output = pd.read_csv(des0, **read_kwargs)

    return output

In [None]:
### Pre-processing of the datasets used for the CI computation:
### extract the parameters of interest and standardize them

# Helper parameters of the CI set. They are extracted first because CI_param is
# reduced to the target columns just below.
CI_sampling = CI_param["samp_proba"]   # sampling probability
CI_tree_size = CI_param["nb_tips"]     # tree size

# Keep only the parameters of interest, in the same column order in every table
CI_param = CI_param[targets]
predicted_test.columns = CI_param.columns
predicted_test = predicted_test[targets]
CI_predicted = CI_predicted[targets]

# Standardize all columns so that each parameter is on the same scale. The scaler
# is fitted on the CI set and then applied to the empirical prediction; column
# names and index values are carried over from the unscaled tables.
scaler = StandardScaler()
CI_param_standardized = pd.DataFrame(
    scaler.fit_transform(CI_param),
    columns=CI_param.columns,
    index=CI_param.index,
)
predicted_test_standardized = pd.DataFrame(
    scaler.transform(predicted_test),
    columns=predicted_test.columns,
    index=predicted_test.index,
)

In [None]:
# Initialize the output table: one row per empirical observation, one column per
# target / statistic / number-of-neighbors combination.
CI_df = pd.DataFrame(index=range(0, predicted_test.shape[0]), columns=col_comp)

# Predicted parameter values from the empirical set: there is only one empirical
# set for which we want to compute CI values, so only row 0 is processed.
# The copy prevents the clipping below from writing back into predicted_test.
current_obs = predicted_test.iloc[0, :].copy()
current_obs_standardized = predicted_test_standardized.iloc[0, :]

# Number of CI simulations kept by each pre-filter. The second value must not
# exceed the first. With both set to 50,000 the second filter only reorders the
# output of the first one; nothing is discarded by it.
N_CLOSEST_TREE_SIZE = 50000
N_CLOSEST_SAMPLING = 50000

## pre-filter the CI set with respect to tree size and sampling frequency
# first filter: keep the N_CLOSEST_TREE_SIZE CI sets closest in tree size
tree_size_indexes = get_indexes_of_closest_single_factor(test_tree_size, CI_tree_size, N_CLOSEST_TREE_SIZE)
filt_1_CI_predicted, filt_1_param_CI_standardized, filt_1_CI_param, filt_1_CI_sampling_proba = \
    apply_filter(CI_predicted, CI_param_standardized, CI_param, CI_sampling, tree_size_indexes)
# reset indexes
filt_1_CI_param.index = filt_1_param_CI_standardized.index = filt_1_CI_predicted.index = \
    filt_1_CI_sampling_proba.index = range(0, N_CLOSEST_TREE_SIZE)

# second filter: keep the N_CLOSEST_SAMPLING CI sets closest in sampling frequency
sampling_proba_indexes = get_indexes_of_closest_single_factor(samp_proba_list, filt_1_CI_sampling_proba,
                                                              N_CLOSEST_SAMPLING)
filt_2_CI_predicted, filt_2_param_CI_standardized, filt_2_CI_param, filt_2_CI_sampling_proba = \
    apply_filter(filt_1_CI_predicted, filt_1_param_CI_standardized, filt_1_CI_param,
                 filt_1_CI_sampling_proba, sampling_proba_indexes)

# reset indexes
filt_2_CI_predicted.index = filt_2_param_CI_standardized.index = filt_2_CI_param.index = \
    filt_2_CI_sampling_proba.index = range(0, N_CLOSEST_SAMPLING)

# vector to stock all measures of the current observation, in the order of col_comp
all_real = []

for elt in targets:

    # find the pre-filtered CI sets whose standardized value of this parameter is
    # closest to the standardized prediction (largest neighborhood requested)
    top_ind = get_indexes_of_closest_single_factor(
        current_obs_standardized[elt], filt_2_param_CI_standardized[elt], n_neighbors[-1])

    # signed errors on the closest parameter sets (predicted - actual values)
    error_closest = get_error_closest_single(top_ind, filt_2_CI_param, filt_2_CI_predicted, elt)

    lower_bound, upper_bound = min_max[elt]
    # Unclipped prediction: every neighborhood size is centered on this same value.
    point_estimate = current_obs[elt]

    for n_neigh in n_neighbors:
        # errors of the n_neigh closest neighbors (positional selection)
        error_closest_n_neigh = error_closest.iloc[:n_neigh]
        median_error = np.median(error_closest_n_neigh)
        # center the error distribution around the given prediction
        # No time rescaling is applied here: column 1 of the prediction was already
        # multiplied by test_rescale when it was computed.
        # NOTE(review): confirm that the CI set is expressed on that same scale.
        centered = (error_closest_n_neigh - median_error + point_estimate).to_numpy()
        # apply minimum and maximum values for each parameter (e.g. no negative values)
        centered = np.clip(centered, lower_bound, upper_bound)
        # compute statistics: 2.5%, 97.5% boundaries and the width between them
        qtls = np.percentile(centered, [2.5, 97.5])
        all_real.extend([qtls[0], qtls[1], qtls[1] - qtls[0]])

    # apply the same bounds to the reported point estimate
    current_obs[elt] = min(upper_bound, max(lower_bound, point_estimate))

CI_df.loc[0, :] = all_real

R_nought [1.4653992700347898, 1.38560477003479, 1.36778887003479, 1.3904548700347896, 1.2822620700347898, 1.1800656700347898, 1.17828627003479, 1.57704367003479, 1.5448487700347897, 1.2611334700347898, 1.50077037003479, 1.52222737003479, 1.6961302700347902, 1.3114597700347899, 1.61545897003479, 1.23671607003479, 1.17545097003479, 1.34722167003479, 1.31965967003479, 1.2517875700347898, 1.3195865700347902, 1.58069367003479, 1.1971972700347902, 1.4115152700347902, 1.2760217700347898, 1.4354346700347902, 1.42422547003479, 1.74707347003479, 1.35755727003479, 1.4465098700347898, 1.6227523700347901, 1.1941290700347897, 1.53986737003479, 1.4654738700347898, 1.3341337700347897, 1.5533517700347899, 1.35346857003479, 1.23918997003479, 1.52359657003479, 1.66374567003479, 1.40118857003479, 1.5770034700347901, 1.37121467003479, 1.3083652700347899, 1.2030383700347902, 1.44457317003479, 1.2973692700347899, 1.08998967003479, 1.41080467003479, 1.37814687003479, 1.3316410700347898, 1.52005677003479, 1.62

In [None]:
# Prepend the (bounded) point estimates to the CI table. The guard keeps this cell
# idempotent: re-running it does not add the estimate columns a second time.
if not current_obs.index.isin(CI_df.columns).all():
    CI_df = pd.concat([current_obs.to_frame().T, CI_df], axis=1)

In [None]:
# Final table: point estimates followed by their CI bounds and CI widths.
# Run note: 1000 trees with 50,000 training, 0.25 — presumably 1000 nearest
# neighbors, 50,000 pre-filtered simulations and sampling probability 0.25; TODO confirm
CI_df

Unnamed: 0,R_nought,infectious_period_rescaled,x_transmission,fraction_1,R_nought_CI_2_5_1000,R_nought_CI_97_5_1000,R_nought_CI_width_1000,infectious_period_rescaled_CI_2_5_1000,infectious_period_rescaled_CI_97_5_1000,infectious_period_rescaled_CI_width_1000,x_transmission_CI_2_5_1000,x_transmission_CI_97_5_1000,x_transmission_CI_width_1000,fraction_1_CI_2_5_1000,fraction_1_CI_97_5_1000,fraction_1_CI_width_1000
0,1.412436,11.32054,20.390572,0.129327,1.156663,1.739923,0.583261,8.761571,14.303697,5.542126,10.561785,26.227327,15.665543,0.078163,0.168898,0.090735
