In [None]:
#Load required libraries
import os

import pandas as pd

import tensorflow as tf
import keras
import numpy as np

from keras.models import Sequential, Model
from keras.layers import Activation, Dense
from keras.layers import Conv2D, GlobalAveragePooling2D, BatchNormalization
from keras.layers import Dense, Dropout
from keras.layers import Dense, Dropout, Activation, Flatten

In [None]:
# Simulation parameters (tab-separated tables) for the training and test sets
param_train = pd.read_csv('./parameters_BDSS.txt', sep='\t')
param_test = pd.read_csv('./testset/parameters_BDSS.txt', sep='\t')

# Tree encodings: the first CSV column is used as index, and the remaining
# values are reshaped into one 1000 x 18 matrix per tree.
encoding = (
    pd.read_csv('./Encoded_trees_BDSS.csv', sep="\t", header=0, index_col=0)
    .values
    .reshape(-1, 1000, 18)
)
encoding_test = (
    pd.read_csv('./testset/Encoded_trees_BDSS.csv', sep="\t", header=0, index_col=0)
    .values
    .reshape(-1, 1000, 18)
)

In [None]:
# Rescale the targets with each tree's rescaling factor, strip the rescaling
# row from the encodings, and assemble the label tables.

# The rescaling factor is read from the last cell of the last row of every
# encoded tree (training and test sets).
param_train['resc_factor'] = encoding[:, -1, -1]
param_test['resc_factor'] = encoding_test[:, -1, -1]

# Express the infectious period in rescaled units so that the targets are on
# the same scale as the encoded trees.
param_train['infectious_period'] = param_train['infectious_period'].div(param_train['resc_factor'])
param_test['infectious_period'] = param_test['infectious_period'].div(param_test['resc_factor'])

# Drop the last row of every tree (the one holding the rescaling factor):
# 1000 rows -> 999 rows.
encoding = np.delete(encoding, -1, axis=1)
encoding_test = np.delete(encoding_test, -1, axis=1)

# Names of the parameters the network has to predict
target_1 = "R_nought"
target_2 = "infectious_period"
target_3 = "x_transmission"
target_4 = "fraction_1"

# Label tables: one row per simulation, restricted to the four target columns
Y = pd.DataFrame(param_train.loc[:, [target_1, target_2, target_3, target_4]])
Y_test = pd.DataFrame(param_test.loc[:, [target_1, target_2, target_3, target_4]])

# Fractions of the training trees assigned to validation and to training
valid_frac = 0.3
train_size_frac = 0.7

In [None]:
# Append the sampling probability of each tree as one extra feature column,
# repeated on every row of that tree's encoding.
# The number of rows is read from the array itself rather than hard-coded as
# 999, so this cell no longer silently depends on how many rows the previous
# cell removed.

samp_proba_list = np.array(param_train['sampling_proba'])
encoding = np.concatenate(
    (encoding, np.repeat(samp_proba_list, encoding.shape[1]).reshape(-1, encoding.shape[1], 1)),
    axis=2,
)

samp_proba_list_test = np.array(param_test['sampling_proba'])
encoding_test = np.concatenate(
    (encoding_test, np.repeat(samp_proba_list_test, encoding_test.shape[1]).reshape(-1, encoding_test.shape[1], 1)),
    axis=2,
)

In [None]:
# This function takes in the tree encodings for both training and testing datasets
# and processes them to have a uniform shape. It also pads the leaves and nodes 
# of the trees to ensure each tree has a fixed number of 500 leaves and nodes.

def _split_sort_pad_tree(tree, n_rows):
    """Turn one encoded tree into an (n_rows, n_features, 2) leaves/nodes array."""
    # Leaves are the rows whose column 3 equals 1; order them by column 1 (age)
    leaves = tree[tree[:, 3] == 1]
    leaves = leaves[np.argsort(leaves[:, 1])]
    leaves = np.pad(leaves, [(0, n_rows - leaves.shape[0]), (0, 0)], mode='constant')

    # Internal nodes are the rows whose column 3 is greater than 1
    nodes = tree[tree[:, 3] > 1]
    nodes = nodes[np.argsort(nodes[:, 1])]
    # Duplicate the last (highest column-1 value) node once so that the node
    # channel holds as many filled rows as a binary tree has leaves
    nodes = np.append(nodes, nodes[-1].reshape(1, -1), axis=0)
    nodes = np.pad(nodes, [(0, n_rows - nodes.shape[0]), (0, 0)], mode='constant')

    # Channel 0 = leaves, channel 1 = internal nodes
    return np.stack((leaves, nodes), axis=2)


def encode_pad_0s_rootage(enc, enc_test, n_rows=500):
    """Split encoded trees into age-sorted, zero-padded leaf and node channels.

    For every tree, rows whose column 3 equals 1 are taken as leaves and rows
    whose column 3 is greater than 1 as internal nodes; every other row is
    dropped. Each group is sorted by column 1, the last internal node is
    duplicated once, and both groups are padded with zero rows up to
    ``n_rows`` before being stacked as two channels. The training and test
    sets go through exactly the same procedure.

    Parameters
    ----------
    enc, enc_test : numpy.ndarray
        Training and test encodings, indexed as (tree, row, feature).
    n_rows : int, default 500
        Number of rows of each channel after padding.

    Returns
    -------
    tuple of numpy.ndarray
        ``(padded_train, padded_test)``. For a non-empty input each array has
        shape (n_trees, n_rows, n_features, 2); an input without trees yields
        an empty array of shape (0,).

    Raises
    ------
    IndexError
        If a tree has no internal node (no row with column 3 greater than 1).
    ValueError
        Raised by ``np.pad`` if a tree has more than ``n_rows`` leaves or more
        than ``n_rows - 1`` internal nodes.
    """
    enc_pad = [_split_sort_pad_tree(tree, n_rows) for tree in enc]
    enc_pad_test = [_split_sort_pad_tree(tree, n_rows) for tree in enc_test]
    return np.array(enc_pad), np.array(enc_pad_test)


# Split every tree into age-sorted leaf/node channels and pad them with zeros
# (training and test encodings are processed in the same call)
encoding_pad, encoding_pad_test = encode_pad_0s_rootage(enc=encoding, enc_test=encoding_test)

In [None]:
# Creation of the Network Model: model definition
def build_model(n_features=19):
    """Build the (uncompiled) convolutional + dense regression network.

    Parameters
    ----------
    n_features : int, default 19
        Number of feature columns per tree row. The input shape is
        (500, n_features, 2), the last axis holding the two channels
        (leaves and nodes).

    Returns
    -------
    keras.models.Sequential
        The model, with its summary already printed. It still has to be
        compiled by the caller.
    """
    model = Sequential()

    # First convolution: 32 filters with a (1, n_features) kernel, i.e. each
    # filter reads one whole row of features at a time. groups=2 applies
    # separate convolutions to the two input channels (leaves and nodes).
    model.add(Conv2D(filters=32, use_bias=False, kernel_size=(1, n_features),
                     input_shape=(500, n_features, 2), activation='elu', groups=2))
    # Batch normalization to stabilize and speed up training
    model.add(BatchNormalization())

    # Second and third convolutions: 32 filters each with a (1, 1) kernel,
    # ELU activation, each followed by batch normalization.
    model.add(Conv2D(filters=32, use_bias=False, kernel_size=(1, 1), activation='elu'))
    model.add(BatchNormalization())
    model.add(Conv2D(filters=32, use_bias=False, kernel_size=(1, 1), activation='elu'))
    model.add(BatchNormalization())

    # Global average pooling (not a flatten): every feature map is averaged
    # over its spatial positions, leaving one value per filter for the dense part.
    model.add(GlobalAveragePooling2D())

    # Fully connected part: dense layers of decreasing width, all with ELU
    model.add(Dense(64, activation='elu'))
    model.add(Dense(32, activation='elu'))
    model.add(Dense(16, activation='elu'))
    model.add(Dense(8, activation='elu'))

    # Output layer: 4 neurons, one per target parameter, ELU activation
    model.add(Dense(4, activation='elu'))

    # Print the layer-by-layer summary (output shapes, parameter counts)
    model.summary()

    return model


In [None]:
from keras import losses

# Create the output folder before training: if it were missing, the failure
# would otherwise only show up after the whole training run, when saving.
os.makedirs('./Trained_Models', exist_ok=True)

# Build a fresh, untrained network with the build_model function defined above
estimator = build_model()

# Compile the model:
# - loss: mean absolute percentage error (MAPE)
# - optimizer: Adam
# - metrics: mean absolute error (MAE), reported alongside the loss
estimator.compile(loss=losses.mean_absolute_percentage_error, optimizer='Adam', metrics=['mae'])

# Early stopping to limit overfitting:
# - monitor: the validation loss
# - patience: stop once the validation loss has not improved for 100 consecutive epochs
# - mode='min': "improved" means the monitored value decreased
# - restore_best_weights: after stopping, go back to the weights of the best epoch
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=100, mode='min', restore_best_weights=True)

class PrintD(keras.callbacks.Callback):
    """Progress display: one dot per epoch, with a line break every 100 epochs."""

    def on_epoch_end(self, epoch, logs=None):
        if epoch % 100 == 0:  # start a new line every 100 epochs
            print('')
        print('.', end='')  # one dot per finished epoch

# Maximum number of epochs (passes over the training data)
EPOCHS = 1000

# Train the model:
# - encoding_pad: padded training encodings (inputs)
# - Y: target values (outputs)
# - validation_split: fraction of the training data held out to compute val_loss
# - batch_size: number of samples per gradient update
# - callbacks: early stopping and the dot progress display
history = estimator.fit(encoding_pad, Y, verbose=1, epochs=EPOCHS, validation_split=valid_frac, batch_size=1024, callbacks=[early_stop, PrintD()])

# Save the model architecture as a JSON string
from keras.models import model_from_json
model = estimator.to_json()
with open('./Trained_Models/Trained_2Generation_BDSS.json', 'w') as json_file:
    json_file.write(model)

# Save the learned weights to an H5 file
# NOTE(review): recent Keras releases (Keras 3) appear to require weight file
# names ending in `.weights.h5` - confirm against the installed version.
estimator.save_weights('./Trained_Models/Trained_2Generation_BDSS.h5')

# Confirmation message once architecture and weights are written
print('model saved!')

In [None]:
# Rebuild the network from its saved JSON architecture.
# The file is read inside a `with` block so the handle is closed even if
# reading fails.
from keras.models import model_from_json
with open('./Trained_Models/Trained_2Generation_BDSS.json', 'r') as json_file:
    model = json_file.read()
estimator = model_from_json(model)
# Load the trained weights into the rebuilt network
estimator.load_weights('./Trained_Models/Trained_2Generation_BDSS.h5')
print('model loaded!')

# Predict the test set; columns and index are taken from Y_test so that
# predictions and targets line up
predicted_test = pd.DataFrame(estimator.predict(encoding_pad_test))
predicted_test.columns = Y_test.columns
predicted_test.index = Y_test.index

In [None]:
#### Save the target and predicted values (back on the original time scale)

# Make sure the output folder exists before writing
os.makedirs('./Predictions', exist_ok=True)

# Target
# Work on an explicit copy: pd.DataFrame(Y_test) does not copy by default, so
# on pandas versions where the new frame shares its data with Y_test the
# rescaling below would also change Y_test and be applied again by every
# later cell that repeats this step.
Y_test_rescaled = Y_test.copy()
Y_test_rescaled['infectious_period'] = Y_test_rescaled['infectious_period']*param_test['resc_factor']
Y_test_rescaled.to_csv('./Predictions/BDSS_target.csv', header=True)

# Predicted: undo the rescaling of the infectious period, then save
predicted_test['infectious_period'] = predicted_test['infectious_period']*param_test['resc_factor']
predicted_test.to_csv('./Predictions/2Generation_BDSS_predicted.csv', header=True)

In [35]:
# Drop the 2nd-generation context to compare the networks: keep feature
# columns 0-8 plus column 18 (the last column, appended after the 18 encoded ones)
encoding_1gen = np.take(encoding_pad, [*range(9), 18], axis=2)
encoding_test_1gen = np.take(encoding_pad_test, [*range(9), 18], axis=2)

In [38]:
def build_model(n_features=10):
    """Build the (uncompiled) convolutional + dense regression network.

    Parameters
    ----------
    n_features : int, default 10
        Number of feature columns per tree row. The input shape is
        (500, n_features, 2), the last axis holding the two channels
        (leaves and nodes).

    Returns
    -------
    keras.models.Sequential
        The model, with its summary already printed. It still has to be
        compiled by the caller.
    """
    model = Sequential()

    # First convolution: 32 filters with a (1, n_features) kernel, i.e. each
    # filter reads one whole row of features at a time. groups=2 applies
    # separate convolutions to the two input channels (leaves and nodes).
    model.add(Conv2D(filters=32, use_bias=False, kernel_size=(1, n_features),
                     input_shape=(500, n_features, 2), activation='elu', groups=2))
    # Batch normalization to stabilize and speed up training
    model.add(BatchNormalization())

    # Second and third convolutions: 32 filters each with a (1, 1) kernel,
    # ELU activation, each followed by batch normalization.
    model.add(Conv2D(filters=32, use_bias=False, kernel_size=(1, 1), activation='elu'))
    model.add(BatchNormalization())
    model.add(Conv2D(filters=32, use_bias=False, kernel_size=(1, 1), activation='elu'))
    model.add(BatchNormalization())

    # Global average pooling (not a flatten): every feature map is averaged
    # over its spatial positions, leaving one value per filter for the dense part.
    model.add(GlobalAveragePooling2D())

    # Fully connected part: dense layers of decreasing width, all with ELU
    model.add(Dense(64, activation='elu'))
    model.add(Dense(32, activation='elu'))
    model.add(Dense(16, activation='elu'))
    model.add(Dense(8, activation='elu'))

    # Output layer: 4 neurons, one per target parameter, ELU activation
    model.add(Dense(4, activation='elu'))

    # Print the layer-by-layer summary (output shapes, parameter counts)
    model.summary()

    return model

In [None]:
from keras import losses

# Create the output folder before training: if it were missing, the failure
# would otherwise only show up after the whole training run, when saving.
os.makedirs('./Trained_Models', exist_ok=True)

# Build a fresh, untrained network with the build_model function defined above
estimator = build_model()

# Compile the model:
# - loss: mean absolute percentage error (MAPE)
# - optimizer: Adam
# - metrics: mean absolute error (MAE), reported alongside the loss
estimator.compile(loss=losses.mean_absolute_percentage_error, optimizer='Adam', metrics=['mae'])

# Early stopping to limit overfitting:
# - monitor: the validation loss
# - patience: stop once the validation loss has not improved for 100 consecutive epochs
# - mode='min': "improved" means the monitored value decreased
# - restore_best_weights: after stopping, go back to the weights of the best epoch
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=100, mode='min', restore_best_weights=True)

class PrintD(keras.callbacks.Callback):
    """Progress display: one dot per epoch, with a line break every 100 epochs."""

    def on_epoch_end(self, epoch, logs=None):
        if epoch % 100 == 0:  # start a new line every 100 epochs
            print('')
        print('.', end='')  # one dot per finished epoch

# Maximum number of epochs (passes over the training data)
EPOCHS = 1000

# Train the model:
# - encoding_1gen: training encodings restricted to the 1-generation feature columns (inputs)
# - Y: target values (outputs)
# - validation_split: fraction of the training data held out to compute val_loss
# - batch_size: number of samples per gradient update
# - callbacks: early stopping and the dot progress display
history = estimator.fit(encoding_1gen, Y, verbose=1, epochs=EPOCHS, validation_split=valid_frac, batch_size=1024, callbacks=[early_stop, PrintD()])

# Save the model architecture as a JSON string
from keras.models import model_from_json
model = estimator.to_json()
with open('./Trained_Models/Trained_1Generation_BDSS.json', 'w') as json_file:
    json_file.write(model)

# Save the learned weights to an H5 file
# NOTE(review): recent Keras releases (Keras 3) appear to require weight file
# names ending in `.weights.h5` - confirm against the installed version.
estimator.save_weights('./Trained_Models/Trained_1Generation_BDSS.h5')

# Confirmation message once architecture and weights are written
print('model saved!')

In [None]:
# Rebuild the network from its saved JSON architecture.
# The file is read inside a `with` block so the handle is closed even if
# reading fails.
from keras.models import model_from_json
with open('./Trained_Models/Trained_1Generation_BDSS.json', 'r') as json_file:
    model = json_file.read()
estimator = model_from_json(model)
# Load the trained weights into the rebuilt network
estimator.load_weights('./Trained_Models/Trained_1Generation_BDSS.h5')
print('model loaded!')

# Predict the test set; columns and index are taken from Y_test so that
# predictions and targets line up
predicted_test = pd.DataFrame(estimator.predict(encoding_test_1gen))
predicted_test.columns = Y_test.columns
predicted_test.index = Y_test.index

In [42]:
#### Save the target and predicted values (back on the original time scale)

# Make sure the output folder exists before writing
os.makedirs('./Predictions', exist_ok=True)

# Target
# Work on an explicit copy: pd.DataFrame(Y_test) does not copy by default, so
# on pandas versions where the new frame shares its data with Y_test the
# rescaling below would also change Y_test and be applied again by every
# other cell that repeats this step.
Y_test_rescaled = Y_test.copy()
Y_test_rescaled['infectious_period'] = Y_test_rescaled['infectious_period']*param_test['resc_factor']
Y_test_rescaled.to_csv('./Predictions/BDSS_target.csv', header=True)

# Predicted: undo the rescaling of the infectious period, then save
predicted_test['infectious_period'] = predicted_test['infectious_period']*param_test['resc_factor']
predicted_test.to_csv('./Predictions/1Generation_BDSS_predicted.csv', header=True)

In [43]:
# Drop all context to compare the networks: keep feature columns 0-3 plus
# column 18 (the last column, appended after the 18 encoded ones)
encoding_NoContext = np.take(encoding_pad, [*range(4), 18], axis=2)
encoding_test_NoContext = np.take(encoding_pad_test, [*range(4), 18], axis=2)

In [46]:
def build_model(n_features=5):
    """Build the (uncompiled) convolutional + dense regression network.

    Parameters
    ----------
    n_features : int, default 5
        Number of feature columns per tree row. The input shape is
        (500, n_features, 2), the last axis holding the two channels
        (leaves and nodes).

    Returns
    -------
    keras.models.Sequential
        The model, with its summary already printed. It still has to be
        compiled by the caller.
    """
    model = Sequential()

    # First convolution: 32 filters with a (1, n_features) kernel, i.e. each
    # filter reads one whole row of features at a time. groups=2 applies
    # separate convolutions to the two input channels (leaves and nodes).
    model.add(Conv2D(filters=32, use_bias=False, kernel_size=(1, n_features),
                     input_shape=(500, n_features, 2), activation='elu', groups=2))
    # Batch normalization to stabilize and speed up training
    model.add(BatchNormalization())

    # Second and third convolutions: 32 filters each with a (1, 1) kernel,
    # ELU activation, each followed by batch normalization.
    model.add(Conv2D(filters=32, use_bias=False, kernel_size=(1, 1), activation='elu'))
    model.add(BatchNormalization())
    model.add(Conv2D(filters=32, use_bias=False, kernel_size=(1, 1), activation='elu'))
    model.add(BatchNormalization())

    # Global average pooling (not a flatten): every feature map is averaged
    # over its spatial positions, leaving one value per filter for the dense part.
    model.add(GlobalAveragePooling2D())

    # Fully connected part: dense layers of decreasing width, all with ELU
    model.add(Dense(64, activation='elu'))
    model.add(Dense(32, activation='elu'))
    model.add(Dense(16, activation='elu'))
    model.add(Dense(8, activation='elu'))

    # Output layer: 4 neurons, one per target parameter, ELU activation
    model.add(Dense(4, activation='elu'))

    # Print the layer-by-layer summary (output shapes, parameter counts)
    model.summary()

    return model

In [None]:
from keras import losses

# Create the output folder before training: if it were missing, the failure
# would otherwise only show up after the whole training run, when saving.
os.makedirs('./Trained_Models', exist_ok=True)

# Build a fresh, untrained network with the build_model function defined above
estimator = build_model()

# Compile the model:
# - loss: mean absolute percentage error (MAPE)
# - optimizer: Adam
# - metrics: mean absolute error (MAE), reported alongside the loss
estimator.compile(loss=losses.mean_absolute_percentage_error, optimizer='Adam', metrics=['mae'])

# Early stopping to limit overfitting:
# - monitor: the validation loss
# - patience: stop once the validation loss has not improved for 100 consecutive epochs
# - mode='min': "improved" means the monitored value decreased
# - restore_best_weights: after stopping, go back to the weights of the best epoch
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=100, mode='min', restore_best_weights=True)

class PrintD(keras.callbacks.Callback):
    """Progress display: one dot per epoch, with a line break every 100 epochs."""

    def on_epoch_end(self, epoch, logs=None):
        if epoch % 100 == 0:  # start a new line every 100 epochs
            print('')
        print('.', end='')  # one dot per finished epoch

# Maximum number of epochs (passes over the training data)
EPOCHS = 1000

# Train the model:
# - encoding_NoContext: training encodings restricted to the no-context feature columns (inputs)
# - Y: target values (outputs)
# - validation_split: fraction of the training data held out to compute val_loss
# - batch_size: number of samples per gradient update
# - callbacks: early stopping and the dot progress display
history = estimator.fit(encoding_NoContext, Y, verbose=1, epochs=EPOCHS, validation_split=valid_frac, batch_size=1024, callbacks=[early_stop, PrintD()])

# Save the model architecture as a JSON string
from keras.models import model_from_json
model = estimator.to_json()
with open('./Trained_Models/Trained_NoContext_BDSS.json', 'w') as json_file:
    json_file.write(model)

# Save the learned weights to an H5 file
# NOTE(review): recent Keras releases (Keras 3) appear to require weight file
# names ending in `.weights.h5` - confirm against the installed version.
estimator.save_weights('./Trained_Models/Trained_NoContext_BDSS.h5')

# Confirmation message once architecture and weights are written
print('model saved!')

In [None]:
# Rebuild the network from its saved JSON architecture.
# The file is read inside a `with` block so the handle is closed even if
# reading fails.
from keras.models import model_from_json
with open('./Trained_Models/Trained_NoContext_BDSS.json', 'r') as json_file:
    model = json_file.read()
estimator = model_from_json(model)
# Load the trained weights into the rebuilt network
estimator.load_weights('./Trained_Models/Trained_NoContext_BDSS.h5')
print('model loaded!')

# Predict the test set; columns and index are taken from Y_test so that
# predictions and targets line up
predicted_test = pd.DataFrame(estimator.predict(encoding_test_NoContext))
predicted_test.columns = Y_test.columns
predicted_test.index = Y_test.index

In [51]:
#### Save the target and predicted values (back on the original time scale)

# Make sure the output folder exists before writing
os.makedirs('./Predictions', exist_ok=True)

# Target
# Work on an explicit copy: pd.DataFrame(Y_test) does not copy by default, so
# on pandas versions where the new frame shares its data with Y_test the
# rescaling below would also change Y_test and be applied again by every
# other cell that repeats this step.
Y_test_rescaled = Y_test.copy()
Y_test_rescaled['infectious_period'] = Y_test_rescaled['infectious_period']*param_test['resc_factor']
Y_test_rescaled.to_csv('./Predictions/BDSS_target.csv', header=True)

# Predicted: undo the rescaling of the infectious period, then save
predicted_test['infectious_period'] = predicted_test['infectious_period']*param_test['resc_factor']
predicted_test.to_csv('./Predictions/NoContext_BDSS_predicted.csv', header=True)