In [1]:
# import all required libraries
import sys, os
import numpy as np
import pandas as pd
import random
from random import shuffle, choice
import time
import os
import glob
import keras
import tensorflow as tf
from keras.utils import np_utils
from keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import backend as K
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.models import load_model
from tensorflow.keras import regularizers
from random import shuffle, choice
from sklearn.preprocessing import MinMaxScaler
import sklearn.metrics as metrics
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint
from keras.models import Model
from sklearn.preprocessing import MinMaxScaler,StandardScaler

# define a function to build MLP for the trait data.    
def create_mlp(traitstrain, regularizer=None):
  model = Sequential()
  # first layer, remember to remove bias if you are intercalating with batch normalization. ReLu is the activation (nonlinear) function.
  model.add(Dense(150, use_bias=False, input_dim=traitstrain.shape[1], activation="relu", kernel_regularizer=regularizers.l1(0.001)))
  # batch normalization.
  model.add(BatchNormalization())
  # second layer.
  model.add(Dense(150, use_bias=False, activation="relu", kernel_regularizer=regularizers.l1(0.001)))
  model.add(BatchNormalization())
  # third layer.
  model.add(Dense(50, activation="relu", kernel_regularizer=regularizers.l1(0.001)))
  return model

# define a function to build a CNN for the SNP data. 
def create_cnn(xtest, regularizer=None):
  # obtain the input dimensions.
  inputShape = (xtest.shape[1], xtest.shape[2])
  inputs = Input(shape=inputShape)
  x = inputs
  # first convolutional layer, remember to remove bias if you are intercalating with batch normalization.
  x = Conv1D(250, kernel_size=3, activation='relu', use_bias=False, input_shape=(xtest.shape[1], xtest.shape[2]))(x)
  # batch normalization.
  x = BatchNormalization()(x)
  # second layer.
  x = Conv1D(250, kernel_size=3, use_bias=False, activation='relu')(x)
  x = BatchNormalization()(x)
  # third layer.
  x = Conv1D(250, kernel_size=3, use_bias=False, activation='relu')(x)
  x = BatchNormalization()(x)
  # pool the CNN outputs.
  x = MaxPooling1D(pool_size=3)(x)
  # flatten in a single vector.
  x = Flatten()(x)
  # this part is similar to the MLP, a fully connected neural network. We intercalated with dropout to reduce overfitting.
  x = Dense(125, activation='relu')(x)
  # dropout.
  x = Dropout(0.5)(x)
  # second layer of the fully connected neural network.
  x = Dense(125, activation='relu')(x)
  x = Dropout(0.5)(x)
  # third layer of the fully connected neural network. This one matches the number of nodes coming out of the MLP.
  x = Dense(50, kernel_regularizer=regularizer)(x)
  x = Activation("relu")(x)
  # Construct the CNN
  model = Model(inputs, x)
  # Return the CNN
  return model

# define a function to combine the outputs of the MLP and the CNN.
# this was obtained from: https://towardsdatascience.com/neural-networks-ensemble-33f33bea7df3
class LinearW(Layer):
    def __init__(self):
        super(LinearW, self).__init__()    
    def build(self, input_shape):
        self.W = self.add_weight(name='name',
                    shape=(1,1,len(input_shape)),
                    initializer='uniform',
                    dtype=tf.float32,
                    trainable=True)
    def call(self, inputs):
        # inputs is a list of tensor of shape [(n_batch, n_feat), ..., (n_batch, n_feat)]
        # expand last dim of each input passed [(n_batch, n_feat, 1), ..., (n_batch, n_feat, 1)]
        inputs = [tf.expand_dims(i, -1) for i in inputs]
        inputs = Concatenate(axis=-1)(inputs) # (n_batch, n_feat, n_inputs)
        weights = tf.nn.softmax(self.W, axis=-1) # (1,1,n_inputs)
        # weights sum up to one on last dim
        return tf.reduce_sum(weights*inputs, axis=-1) # (n_batch, n_feat)

In [2]:
## define variables that will be used to train all networks.
# size of the minibatches containing simulations are passed through the network in each epoch.
batch_size = 250
# number of training iterations (epochs) for the SNP only and the combined networks.
epochs = 100
# number of training iterations (epochs) for the traits only networks.
epochs_traits = 500
# number of scenarios being classified.
num_classes = 4

In [29]:
################################################################################################################################################
#Load data
################################################################################################################################################
# load the traits simulated under the BM model for the 4 scenarios. 
traits_BM = []
traits_BM = np.loadtxt("./traits/traits_BM.txt").reshape(40000,-1,4)
# transform into a NumPy array. 
traits_BM = np.array(traits_BM)

# standard scale the continuous (BM) traits
scalers_BM = {}
for i in range(traits_BM.shape[2]):
    scalers_BM[i] = StandardScaler(copy=False)
    traits_BM[:, :, i] = scalers_BM[i].fit_transform(traits_BM[:, :, i]) 

# load the traits simulated under the OU model for the 4 scenarios. 
traits_OU = []
traits_OU = np.loadtxt("./traits/traits_OU.txt").reshape(40000,-1,4)
# transform into a NumPy array. 
traits_OU = np.array(traits_OU)

# standard scale the continuous (OU) traits
scalers_OU = {}
for i in range(traits_OU.shape[2]):
    scalers_OU[i] = StandardScaler(copy=False)
    traits_OU[:, :, i] = scalers_OU[i].fit_transform(traits_OU[:, :, i]) 

#Add missing individuals in E. balsamifera subsp. balsamifera as 0s
for i in range(traits_BM.shape[0]):
  for m in range(51):
    j = random.randint(19, 98)
    traits_BM[i,j,:] = 0
    
#Add missing individuals in E. balsamifera subsp. balsamifera as 0s
for i in range(traits_OU.shape[0]):
  for m in range(51):
    j = random.randint(19, 98)
    traits_OU[i,j,:] = 0

# load the SNPs simulated for the 4 scenarios. 
u1 = np.load("./trainingSims/Model_1sp.npz")
u2 = np.load("./trainingSims/Model_2spMorph.npz")
u3 = np.load("./trainingSims/Model_2spPhylo.npz")
u4 = np.load("./trainingSims/Model_3sp.npz")

# combine the loaded SNPs in a single NumPy array.
X=np.concatenate((u1['Model_1sp'],u2['Model_2spMorph'],u3['Model_2spPhylo'],u4['Model_3sp']),axis=0)

#transform major alleles in -1 and minor in 1
for arr,array in enumerate(X):
  for idx,row in enumerate(array):
    if np.count_nonzero(row) > len(row)/2:
      X[arr][idx][X[arr][idx] == 1] = -1
      X[arr][idx][X[arr][idx] == 0] = 1
    else:
      X[arr][idx][X[arr][idx] == 0] = -1

# create a label vector in the same order as the simulations.
y=[0 for i in range(len(u1['Model_1sp']))]
y.extend([1 for i in range(len(u2['Model_2spMorph']))])
y.extend([2 for i in range(len(u3['Model_2spPhylo']))])
y.extend([3 for i in range(len(u4['Model_3sp']))])
y = np.array(y)
print (len(X), len(y), len(traits_BM))

# separate 75% of labels, SNP and traits matrices as training set. The other 25% are assigned to the test set. The two sets are shuffled.
ytrain, ytest, xtrain, xtest, traits_BM_train, traits_BM_test  = train_test_split(y,X,traits_BM,test_size=0.25, shuffle=True,stratify=y)

# convert labels to a categorical matrix of binary values (0 or 1). The number of rows is the length of the input vector (number of simulations) and the number of columns is the number of classes (3 scenarios).
ytest = np_utils.to_categorical(ytest, num_classes)
ytrain = np_utils.to_categorical(ytrain, num_classes)
# reshape the traits matrices to input them into the MLP
traits_BM_train=traits_BM_train.reshape((traits_BM_train.shape[0], (traits_BM_train.shape[1]*traits_BM_train.shape[2])))
traits_BM_test=traits_BM_test.reshape((traits_BM_test.shape[0], (traits_BM_test.shape[1]*traits_BM_test.shape[2])))

40000 40000 40000


In [5]:
################################################################################################################################################
#Train the combined SNPs + traits (BM) network
################################################################################################################################################
# Create the MLP, the CNN and the combined models
mlp = create_mlp(traits_BM_train)
cnn = create_cnn(xtest)
combinedInput = LinearW()([mlp.output, cnn.output])

# The final fully-connected layer head will have two dense layers (one relu and one softmax)
x = Dense(50, activation="relu")(combinedInput)
x = Dense(num_classes, activation="softmax")(x)

# The final model accepts numerical data on the MLP input and images on the CNN input, outputting a single value
model = Model(inputs=[mlp.input, cnn.input], outputs=x)

# using Stochastic Gradient Descent as optimizer and a categorical cross-entropy loss function
opt = SGD(learning_rate=0.001)
model.compile(loss=keras.losses.categorical_crossentropy,
	              optimizer=opt,
	              metrics=['accuracy'])

print(model.summary())
# save only the epoch with the highest accuracy in the validation set, by using the model checkpoint
earlyStopping = EarlyStopping(monitor='val_accuracy', patience=150, verbose=0, mode='max', restore_best_weights=True)

# fit the model and record running times
start = time.time()
model.fit([traits_BM_train, xtrain], ytrain, batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=([traits_BM_test, xtest], ytest),callbacks=[earlyStopping])
print (f'Time: {time.time() - start}')

# save the model
model.save(filepath='./TrainedModels/Trained_Comb_Model_1KSNPs_BM.acc.mod')

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 429, 109)]   0           []                               
                                                                                                  
 conv1d (Conv1D)                (None, 427, 250)     81750       ['input_1[0][0]']                
                                                                                                  
 batch_normalization_2 (BatchNo  (None, 427, 250)    1000        ['conv1d[0][0]']                 
 rmalization)                                                                                     
                                                                                                  
 conv1d_1 (Conv1D)              (None, 425, 250)     187500      ['batch_normalization_2[0][

2023-08-28 13:35:05.817466: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ./TrainedModels/Trained_Comb_Model_1KSNPs_BM.acc.mod/assets


In [6]:
################################################################################################################################################
#Train the SNPs only network
################################################################################################################################################

#Create the last layer for the SNP network
xCNN = Dense(num_classes, activation="softmax")(cnn.output)
model = Model(inputs=cnn.input, outputs=xCNN)

# using Stochastic Gradient Descent as optimizer and a categorical cross-entropy loss function
opt = SGD(learning_rate=0.001)
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=opt,
              metrics=['accuracy'])

print(model.summary())

# save only the epoch with the highest accuracy in the validation set, by using the model checkpoint
earlyStopping = EarlyStopping(monitor='val_accuracy', patience=150, verbose=0, mode='max', restore_best_weights=True)

# fit the model and record running times
start = time.time()
model.fit(xtrain, ytrain, batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(xtest, ytest),callbacks=[earlyStopping])
print (f'Time: {time.time() - start}')

# save the model
model.save(filepath='./TrainedModels/Trained_CNN_Model_1KSNPs.acc.mod')

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 429, 109)]        0         
                                                                 
 conv1d (Conv1D)             (None, 427, 250)          81750     
                                                                 
 batch_normalization_2 (Batc  (None, 427, 250)         1000      
 hNormalization)                                                 
                                                                 
 conv1d_1 (Conv1D)           (None, 425, 250)          187500    
                                                                 
 batch_normalization_3 (Batc  (None, 425, 250)         1000      
 hNormalization)                                                 
                                                                 
 conv1d_2 (Conv1D)           (None, 423, 250)          1875

In [30]:
#remove
# Create the MLP, the CNN and the combined models
mlp = create_mlp(traits_BM_train)
cnn = create_cnn(xtest)

In [31]:
################################################################################################################################################
#Train the traits only (BM) network
################################################################################################################################################

#Create the last layer for the traits network
xMLP = Dense(num_classes, activation="softmax")(mlp.output)
model = Model(inputs=mlp.input, outputs=xMLP)
# using Stochastic Gradient Descent as optimizer and a categorical cross-entropy loss function
opt = SGD(learning_rate=0.001)
model.compile(loss=keras.losses.categorical_crossentropy,
          optimizer=opt,
          metrics=['accuracy'])
print(model.summary())
# save only the epoch with the highest accuracy in the validation set, by using the model checkpoint
earlyStopping = EarlyStopping(monitor='val_accuracy', patience=150, verbose=0, mode='max', restore_best_weights=True)
# fit the model and record running times
start = time.time()
model.fit(traits_BM_train, ytrain, batch_size=batch_size,
      epochs=epochs_traits,
      verbose=1,
      validation_data=(traits_BM_test, ytest),callbacks=[earlyStopping])
print (f'Time: {time.time() - start}')

# save the model        
model.save(filepath='./TrainedModels/Trained_Traits_Model_BM.acc.mod')

Model: "model_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_27_input (InputLayer)  [(None, 436)]            0         
                                                                 
 dense_27 (Dense)            (None, 150)               65400     
                                                                 
 batch_normalization_15 (Bat  (None, 150)              600       
 chNormalization)                                                
                                                                 
 dense_28 (Dense)            (None, 150)               22500     
                                                                 
 batch_normalization_16 (Bat  (None, 150)              600       
 chNormalization)                                                
                                                                 
 dense_29 (Dense)            (None, 50)                755

In [12]:
################################################################################################################################################
#Train the combined SNPs + traits (OU) network
################################################################################################################################################

# separate 75% of labels, SNP and traits matrices as training set. The other 25% are assigned to the test set. The two sets are shuffled.
ytrain, ytest, xtrain, xtest, traits_OU_train, traits_OU_test  = train_test_split(y,X,traits_OU,test_size=0.25, shuffle=True,stratify=y)

# convert labels to a categorical matrix of binary values (0 or 1). The number of rows is the length of the input vector (number of simulations) and the number of columns is the number of classes (3 scenarios).
ytest = np_utils.to_categorical(ytest, num_classes)
ytrain = np_utils.to_categorical(ytrain, num_classes)
# reshape the traits matrices to input them into the MLP
traits_OU_train=traits_OU_train.reshape((traits_OU_train.shape[0], (traits_OU_train.shape[1]*traits_OU_train.shape[2])))
traits_OU_test=traits_OU_test.reshape((traits_OU_test.shape[0], (traits_OU_test.shape[1]*traits_OU_test.shape[2])))

# Create the MLP, the CNN and the combined models
mlp = create_mlp(traits_OU_train)
cnn = create_cnn(xtest)
combinedInput = LinearW()([mlp.output, cnn.output])

# The final fully-connected layer head will have two dense layers (one relu and one softmax)
x = Dense(50, activation="relu")(combinedInput)
x = Dense(num_classes, activation="softmax")(x)


# The final model accepts numerical data on the MLP input and images on the CNN input, outputting a single value
model = Model(inputs=[mlp.input, cnn.input], outputs=x)

# using Stochastic Gradient Descent as optimizer and a categorical cross-entropy loss function
opt = SGD(learning_rate=0.001)
model.compile(loss=keras.losses.categorical_crossentropy,
	              optimizer=opt,
	              metrics=['accuracy'])

print(model.summary())

# save only the epoch with the highest accuracy in the validation set, by using the model checkpoint
earlyStopping = EarlyStopping(monitor='val_accuracy', patience=150, verbose=0, mode='max', restore_best_weights=True)

# fit the model and record running times
start = time.time()
model.fit([traits_OU_train, xtrain], ytrain, batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=([traits_OU_test, xtest], ytest),callbacks=[earlyStopping])
print (f'Time: {time.time() - start}')

# save the model
model.save(filepath='./TrainedModels/Trained_Comb_Model_1KSNPs_OU.acc.mod')

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 429, 109)]   0           []                               
                                                                                                  
 conv1d_3 (Conv1D)              (None, 427, 250)     81750       ['input_2[0][0]']                
                                                                                                  
 batch_normalization_7 (BatchNo  (None, 427, 250)    1000        ['conv1d_3[0][0]']               
 rmalization)                                                                                     
                                                                                                  
 conv1d_4 (Conv1D)              (None, 425, 250)     187500      ['batch_normalization_7[0][

In [13]:
################################################################################################################################################
#Train the traits only (OU) network
################################################################################################################################################
#Create the last layer for the traits network
xMLP = Dense(num_classes, activation="softmax")(mlp.output)
model = Model(inputs=mlp.input, outputs=xMLP)
# using Stochastic Gradient Descent as optimizer and a categorical cross-entropy loss function
opt = SGD(learning_rate=0.001)
model.compile(loss=keras.losses.categorical_crossentropy,
          optimizer=opt,
          metrics=['accuracy'])
print(model.summary())
# save only the epoch with the highest accuracy in the validation set, by using the model checkpoint
earlyStopping = EarlyStopping(monitor='val_accuracy', patience=150, verbose=0, mode='max', restore_best_weights=True)
# fit the model and record running times
start = time.time()
model.fit(traits_OU_train, ytrain, batch_size=batch_size,
      epochs=epochs_traits,
      verbose=1,
      validation_data=(traits_OU_test, ytest),callbacks=[earlyStopping])
print (f'Time: {time.time() - start}')

# save the model
model.save(filepath='./TrainedModels/Trained_Traits_Model_OU.acc.mod')

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10_input (InputLayer)  [(None, 436)]            0         
                                                                 
 dense_10 (Dense)            (None, 150)               65400     
                                                                 
 batch_normalization_5 (Batc  (None, 150)              600       
 hNormalization)                                                 
                                                                 
 dense_11 (Dense)            (None, 150)               22500     
                                                                 
 batch_normalization_6 (Batc  (None, 150)              600       
 hNormalization)                                                 
                                                                 
 dense_12 (Dense)            (None, 50)                7550

In [32]:
#################################################################################################################################################
##Perform cross-validation
#################################################################################################################################################
# Now that the models are trained, we will evaluate their accuracy based on the test set. For that, we will build confusion matrices containing the true and predicted scenarions for each simulation on the test set.

# first import the libraries
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from keras.models import load_model
from sklearn.metrics import confusion_matrix

# load the trained models.
model1 = load_model('./TrainedModels/Trained_Comb_Model_1KSNPs_BM.acc.mod')
model2 = load_model('./TrainedModels/Trained_CNN_Model_1KSNPs.acc.mod')
model3 = load_model('./TrainedModels/Trained_Traits_Model_BM.acc.mod')
model4 = load_model('./TrainedModels/Trained_Comb_Model_1KSNPs_OU.acc.mod')
model5 = load_model('./TrainedModels/Trained_Traits_Model_OU.acc.mod')

In [33]:
# load the traits simulated under the BM model for the 4 scenarios.
traits_BM = []
traits_BM = np.loadtxt("./testSims/traits/traits_BM.txt").reshape(4000,-1,4)
# transform into a NumPy array. 
traits_BM = np.array(traits_BM)

#Use standard scaling for the continuous (BM) traits.
for i in range(traits_BM.shape[2]):
    traits_BM[:, :, i] = scalers_BM[i].transform(traits_BM[:, :, i]) 

# load the traits simulated under the OU model for the 4 scenarios.
traits_OU = []
traits_OU = np.loadtxt("./testSims/traits/traits_OU.txt").reshape(4000,-1,4)
# transform into a NumPy array. 
traits_OU = np.array(traits_OU)

#Use standard scaling for the continuous (OU) traits.
for i in range(traits_OU.shape[2]):
    traits_OU[:, :, i] = scalers_OU[i].transform(traits_OU[:, :, i]) 

#Add missing individuals in E. balsamifera subsp. balsamifera as 0s
for i in range(traits_BM.shape[0]):
  for m in range(51):
    j = random.randint(19, 98)
    traits_BM[i,j,:] = 0
    
#Add missing individuals in E. balsamifera subsp. balsamifera as 0s
for i in range(traits_OU.shape[0]):
  for m in range(51):
    j = random.randint(19, 98)
    traits_OU[i,j,:] = 0

# load the SNPs simulated for the 4 scenarios. 
u1 = np.load("./testSims/Model_1sp.npz")
u2 = np.load("./testSims/Model_2spMorph.npz")
u3 = np.load("./testSims/Model_2spPhylo.npz")
u4 = np.load("./testSims/Model_3sp.npz")

# combine the loaded SNPs in a single NumPy array.
xtest=np.concatenate((u1['Model_1sp'],u2['Model_2spMorph'],u3['Model_2spPhylo'],u4['Model_3sp']),axis=0)

#transform major alleles in -1 and minor in 1
for arr,array in enumerate(xtest):
  for idx,row in enumerate(array):
    if np.count_nonzero(row) > len(row)/2:
      xtest[arr][idx][xtest[arr][idx] == 1] = -1
      xtest[arr][idx][xtest[arr][idx] == 0] = 1
    else:
      xtest[arr][idx][xtest[arr][idx] == 0] = -1

#Add missing data as 0s, according to a specifies missing data percentage
#46,761 SNP datapoints and 8,287 missing genotypes = 17.7%
missD_perc = 17.7
missD = int(xtest.shape[1]*xtest.shape[2]*(missD_perc/100))
for i in range(xtest.shape[0]):
  for m in range(missD):
    j = random.randint(0, xtest.shape[1] - 1)
    k = random.randint(0, xtest.shape[2] - 1)
    xtest[i][j][k] = 0

# create a label vector in the same order as the simulations.
ytest=[0 for i in range(len(u1['Model_1sp']))]
ytest.extend([1 for i in range(len(u2['Model_2spMorph']))])
ytest.extend([2 for i in range(len(u3['Model_2spPhylo']))])
ytest.extend([3 for i in range(len(u4['Model_3sp']))])
ytest = np.array(ytest)

In [34]:
# Now we will print the confusion matrices for each trained model
#first get the predictions
pred = model1.predict([traits_BM.reshape(4000,-1), xtest])
pred_cat = [i.argmax() for i in pred]
ytest_cat = [i.argmax() for i in ytest]
# Print the confusion matrix
print (confusion_matrix(ytest, pred_cat))

# get the predictions
pred = model2.predict(xtest)
pred_cat = [i.argmax() for i in pred]
ytest_cat = [i.argmax() for i in ytest]
# Print the confusion matrix
print (confusion_matrix(ytest, pred_cat))

# get the predictions
pred = model3.predict(traits_BM.reshape(4000,-1))
pred_cat = [i.argmax() for i in pred]
ytest_cat = [i.argmax() for i in ytest]
# Print the confusion matrix
print (confusion_matrix(ytest, pred_cat))

# get the predictions
pred = model4.predict([traits_OU.reshape(4000,-1), xtest])
pred_cat = [i.argmax() for i in pred]
ytest_cat = [i.argmax() for i in ytest]
# Print the confusion matrix
print (confusion_matrix(ytest, pred_cat))

# get the predictions
pred = model5.predict(traits_OU.reshape(4000,-1))
pred_cat = [i.argmax() for i in pred]
ytest_cat = [i.argmax() for i in ytest]
# Print the confusion matrix
print (confusion_matrix(ytest, pred_cat))

[[1000    0    0    0]
 [   5  995    0    0]
 [  78    0  920    2]
 [  16   13    2  969]]
[[1000    0    0    0]
 [   7  993    0    0]
 [  35    0  965    0]
 [   9   10    5  976]]
[[864  42  78  16]
 [ 60 871   9  60]
 [ 77  13 814  96]
 [ 38 100 147 715]]
[[1000    0    0    0]
 [   1  999    0    0]
 [  29    0  971    0]
 [   5    4    1  990]]
[[860  36  82  22]
 [ 60 849   7  84]
 [100  22 759 119]
 [ 45 100 152 703]]


In [35]:
################################################################################################################################################
#Predict empirical data
################################################################################################################################################
# load the empirical SNPs data. 
inSNPs=np.loadtxt("./input_SNPs.txt")
# transform into a NumPy array. 
EmpSNPs = np.array(inSNPs)

#transform major alleles in -1 and minor in 1
for idx,row in enumerate(EmpSNPs):
  if np.count_nonzero(row==1) > np.count_nonzero(row==-1):
    EmpSNPs[idx][EmpSNPs[idx] == 1] = 9
    EmpSNPs[idx][EmpSNPs[idx] == -1] = 1
    EmpSNPs[idx][EmpSNPs[idx] == 9] = -1

#create 100 randomly resampled datasets from the original SNP matrices
EmpSNPs = np.repeat(EmpSNPs[np.newaxis, :, :], 100, axis=0)

# load the continuous traits from each putative species.
ade=np.genfromtxt("./input_traits_ade.txt", delimiter="\t", filling_values=0)
bal=np.genfromtxt("./input_traits_bal.txt", delimiter="\t", filling_values=0)
sep=np.genfromtxt("./input_traits_sep.txt", delimiter="\t", filling_values=0)

# transform into a NumPy array. 
ade=np.array(ade)
bal=np.array(bal)
sep=np.array(sep)

# create 100 randomly resampled datasets from the original traits
res = []
for i in range(0,100):
  idx_ade = np.random.choice(ade.shape[0], 19, replace=False) # sample 19 random individuals from ade
  n = ade[idx_ade,:]
  idx_bal = np.random.choice(bal.shape[0], 29, replace=False) # sample 29 random individuals from bal
  n_z = np.zeros((80,4)) # create missing data for 80 samples (bal)
  for i in range(len(idx_bal)):
    idx_nz=np.random.choice(80, len(idx_bal), replace=False) # add the 29 avaiable samples for bal in random positions of the created missing data 
    n_z[idx_nz[i]] = bal[idx_bal[i],:]
  n = np.concatenate((n,n_z), axis=0) # combine ade and bal
  idx_sep = np.random.choice(sep.shape[0], 10, replace=False) # sample 10 random individuals from sep
  n = np.concatenate((n,sep[idx_sep,:]), axis=0) # combine ade, bal and sep
  # append into a NumPy array. 
  res.append(np.array(n)) 

# load into a NumPy array.
traits = np.array(res)

#Use standard scaling for the continuous (BM) traits.
for i in range(traits.shape[2]):
    traits[:, :, i] = scalers_BM[i].transform(traits[:, :, i])

# transform into a NumPy array.
emp_traits_BM = np.array(traits)

# load into a NumPy array.
traits = np.array(res)

#Use standard scaling for the continuous (OU) traits.
for i in range(traits.shape[2]):
    traits[:, :, i] = scalers_OU[i].transform(traits[:, :, i])

# transform into a NumPy array.
emp_traits_OU = np.array(traits)

In [36]:
# Perform predictions for the empirical data

# Combined SNPs and BM traits
Emp_CombBM_pred = model1.predict([emp_traits_BM.reshape(100,-1),EmpSNPs])

np.savetxt("Pred_Emp_Comb_BM_Predictions.txt", Emp_CombBM_pred)

# Only SNPs
Emp_SNP_pred = model2.predict(EmpSNPs)

np.savetxt("Pred_Emp_SNP_Predictions.txt", Emp_SNP_pred)

# only BM traits
Emp_traitsBM_pred = model3.predict(emp_traits_BM.reshape(100,-1))

np.savetxt("Pred_Emp_traits_BM_Predictions.txt", Emp_traitsBM_pred)

# Combined SNPs and OU traits
Emp_CombOU_pred = model4.predict([emp_traits_OU.reshape(100,-1),EmpSNPs])

np.savetxt("Pred_Emp_Comb_OU_Predictions.txt", Emp_CombOU_pred)

# only OU traits
Emp_traitsOU_pred = model5.predict(emp_traits_OU.reshape(100,-1))

np.savetxt("Pred_Emp_traits_OU_Predictions.txt", Emp_traitsOU_pred)