## **Notebook containing scripts and outputs of the training, cross-validation and empirical data prediction for *Pilosocereus aurisetus***
From the manuscript Perez et al. "Species Delimitation Meets Deep Learning: Insights from a Highly Fragmented Cactus System"


In [None]:
# Mount Google Drive at /content/drive (Google Colab only).
# Every data and model path used further down in this notebook is located under
# "/content/drive/My Drive/...", so this cell has to run before any of them.
# drive.mount() prompts interactively for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Import all required modules.
import sys, os
import numpy as np
import keras
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Flatten
from keras.layers.merge import concatenate
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import  AveragePooling1D
from keras import backend as K
from sklearn.neighbors import NearestNeighbors
from random import shuffle
import time

# Define parameters for the CNN run.
# batch_size and epochs are forwarded unchanged to cnn.fit() in the training cells below.
batch_size = 250
epochs = 250
# num_classes is the width of the softmax output layer built by create_cnn() and the
# number of columns produced by keras.utils.to_categorical() when one-hot encoding labels.
num_classes = 5

# Define the CNN architecture.
def create_cnn(xtest, regularizer=None):
    """Assemble the (uncompiled) 1D convolutional classifier.

    Layer stack, in order: Conv1D(250) -> 2 x [Conv1D(125), AveragePooling1D(2),
    Dropout(0.75)] -> Flatten -> 2 x [Dense(125), Dropout(0.5)] -> Dense softmax.
    All convolutions use kernel_size=2 and ReLU activation. The softmax layer
    has ``num_classes`` units, where ``num_classes`` is read from the
    notebook's global scope.

    Parameters
    ----------
    xtest : array-like
        Only ``xtest.shape[1]`` and ``xtest.shape[2]`` are read; they define
        the per-sample input shape of the network.
    regularizer : optional
        Accepted for interface compatibility; it is not used in the body.

    Returns
    -------
    keras.models.Model
        The constructed model. Compilation is left to the caller.
    """
    sample_shape = (xtest.shape[1], xtest.shape[2])
    net_in = Input(shape=sample_shape)

    # Convolutional feature extractor.
    h = Conv1D(250, kernel_size=2, activation='relu', input_shape=sample_shape)(net_in)
    for _ in range(2):
        h = Conv1D(125, kernel_size=2, activation='relu')(h)
        h = AveragePooling1D(pool_size=2)(h)
        h = Dropout(0.75)(h)

    # Fully-connected head.
    h = Flatten()(h)
    for _ in range(2):
        h = Dense(125, activation='relu')(h)
        h = Dropout(0.5)(h)

    # The final fully-connected layer is a softmax over the candidate models.
    class_probs = Dense(num_classes, activation="softmax")(h)

    # Construct and return the CNN.
    return Model(net_in, class_probs)

# **Train the network with 10,000 simulations from each model**
Here we will use the full simulated dataset to train the network, by splitting the data with 75% of simulations for training and 25% for validation.



In [None]:
################################################################################################################################################
#Train a network using 10K simulations per model.
################################################################################################################################################
# Load the Numpy arrays containing the simulations (one .npy file per model),
# opened as read-only memory maps. u1..u5 are reused by the reduced-size training cells.
train_dir = "/content/drive/My Drive/Colab Notebooks/CNN_SpDelimitation_Piloso/trainingSims/Piloso"
u1, u2, u3, u4, u5 = (
    np.load(os.path.join(train_dir, "simModel{}.npy".format(k)), mmap_mode='r')
    for k in range(1, 6)
)
sims_by_model = (u1, u2, u3, u4, u5)

# Combine all arrays along the simulation axis.
x = np.concatenate(sims_by_model, axis=0)

# Label each simulation with the 0-based index of the model it came from.
y = np.repeat(np.arange(len(sims_by_model)), [len(u) for u in sims_by_model])

# Print label and simulations length, these should be the same.
print(len(x), len(y))

# Shuffle the arrays for training, applying the same permutation to data and labels.
shf = list(range(len(x)))
shuffle(shf)
y = y[shf]
x = x[shf]

# Separate train (75%) and validate (25%) sets: the first quarter of the shuffled
# data is held out for validation, the rest is used for training.
n_val = int(len(y) * .25)
xtrain, xtest = x[n_val:], x[:n_val]
ytrain, ytest = y[n_val:], y[:n_val]
ytest = keras.utils.to_categorical(ytest, num_classes)
ytrain = keras.utils.to_categorical(ytrain, num_classes)

# Create the CNN network, using the architecture defined above.
cnn = create_cnn(xtest)

# Compile the CNN.
cnn.compile(loss=keras.losses.categorical_crossentropy,
            optimizer=keras.optimizers.Adam(),
            metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
cnn.summary()

# Start a timer to estimate training runtime.
start = time.time()
# Run the CNN.
cnn.fit(xtrain, ytrain,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        validation_data=(xtest, ytest))
print('Time: ')
print(time.time() - start)

# Save the trained model
cnn.save(filepath='/content/drive/My Drive/Colab Notebooks/CNN_SpDelimitation_Piloso/Trained_10KSims.acc.mod')

50000 50000
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 214, 64)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 213, 250)          32250     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 212, 125)          62625     
_________________________________________________________________
average_pooling1d_1 (Average (None, 106, 125)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 106, 125)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 105, 125)          31375     
_________________________________________________________________
average_pooling1d_2 (Average (None, 52, 125)   

In [None]:
################################################################################################################################################
#Evaluate the CNN trained with 10K simulations per model, using 1,000 simulations per model as test set.
################################################################################################################################################
from keras.models import load_model
from sklearn.metrics import confusion_matrix

# Load Numpy arrays containing test set simulations (one .npz archive per model,
# each holding an array stored under the key "simModel<k>").
test_dir = "/content/drive/My Drive/Colab Notebooks/CNN_SpDelimitation_Piloso/TestData/Piloso"
test_sets = []
for k in range(1, 6):
    key = "simModel{}".format(k)
    # np.load on an .npz returns an open archive handle; read the array inside a
    # `with` block so the underlying file is closed again.
    with np.load(os.path.join(test_dir, key + ".npz")) as archive:
        test_sets.append(archive[key])
t1, t2, t3, t4, t5 = test_sets
x = np.concatenate(test_sets, axis=0)

# Label simulations from the test set with the 0-based index of their model.
y = np.repeat(np.arange(len(test_sets)), [len(t) for t in test_sets])

# Load the trained model.
model = load_model('/content/drive/My Drive/Colab Notebooks/CNN_SpDelimitation_Piloso/Trained_10KSims.acc.mod')

# Predict and export a confusion matrix.
pred = model.predict(x)
# Predicted class = index of the highest softmax output for each simulation.
pred_cat = pred.argmax(axis=1)
cm = confusion_matrix(y, pred_cat)
print(cm)
# Counts divided by the total number of test simulations (the whole matrix sums
# to 1; rows are not normalised individually).
print(cm / float(len(y)))

[[ 965    0   35    0    0]
 [   0  978    0   13    9]
 [   0    0  999    1    0]
 [   0    0   23  941   36]
 [   0    0    0    0 1000]]
[[0.193  0.     0.007  0.     0.    ]
 [0.     0.1956 0.     0.0026 0.0018]
 [0.     0.     0.1998 0.0002 0.    ]
 [0.     0.     0.0046 0.1882 0.0072]
 [0.     0.     0.     0.     0.2   ]]


In [None]:
################################################################################################################################################
#Predict the most likely model for the empirical data, using the CNN trained with 10K simulations per model.
################################################################################################################################################
# Imported here so this cell does not depend on an evaluation cell having been run first.
from keras.models import load_model

# Load the trained network from the location where the 10K training cell saves it
# (".../CNN_SpDelimitation_Piloso/Trained_10KSims.acc.mod").
model = load_model('/content/drive/My Drive/Colab Notebooks/CNN_SpDelimitation_Piloso/Trained_10KSims.acc.mod')
# Load empirical data.
# NOTE(review): this path still points at the "CNN_SpDelimitation" folder, while the
# simulations and saved models live under "CNN_SpDelimitation_Piloso" - confirm where
# Input_Piloso.txt actually is.
infile = np.loadtxt("/content/drive/My Drive/Colab Notebooks/CNN_SpDelimitation/Input_Piloso.txt")
# Transpose the matrix and add a leading axis of length 1, so the single empirical
# dataset is passed to predict() as a batch of one.
x = np.expand_dims(np.array(infile).T, axis=0)
# Predict the most likely model (one softmax probability per candidate model).
pred = model.predict(x)
print(pred)

[[5.3287163e-02 9.4671279e-01 0.0000000e+00 1.4628516e-13 4.1710716e-16]]


# **Evaluate the impact of using different numbers of simulations to train the network**
Below, we repeat the training and evaluation procedures for the CNN with smaller numbers of simulations per model (2,500; 1,000; and 500).

In [None]:
################################################################################################################################################
#Train the network with 2.5K simulations per model
################################################################################################################################################
# Requires u1..u5 (the memory-mapped simulation arrays) from the 10K training cell.
sims_per_model = 2500

# Keep only the first `sims_per_model` simulations of each model and combine them.
subsets = [u[0:sims_per_model, :, :] for u in (u1, u2, u3, u4, u5)]
x = np.concatenate(subsets, axis=0)

# Label each simulation with the 0-based index of the model it came from.
y = np.repeat(np.arange(len(subsets)), [len(s) for s in subsets])

print(len(x), len(y))

# Shuffle data and labels with the same permutation.
shf = list(range(len(x)))
shuffle(shf)
y = y[shf]
x = x[shf]

# First 25% of the shuffled data is held out for validation, the remaining 75% trains.
n_val = int(len(y) * .25)
xtrain, xtest = x[n_val:], x[:n_val]
ytrain, ytest = y[n_val:], y[:n_val]
ytest = keras.utils.to_categorical(ytest, num_classes)
ytrain = keras.utils.to_categorical(ytrain, num_classes)

# Create the CNN network
cnn = create_cnn(xtest)

cnn.compile(loss=keras.losses.categorical_crossentropy,
            optimizer=keras.optimizers.Adam(),
            metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
cnn.summary()

# Time the training run.
start = time.time()
cnn.fit(xtrain, ytrain,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        validation_data=(xtest, ytest))
print('Time: ')
print(time.time() - start)

cnn.save(filepath='/content/drive/My Drive/Colab Notebooks/CNN_SpDelimitation_Piloso/Trained_2.5KSims.acc.mod')

12500 12500
Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 214, 64)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 213, 250)          32250     
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 212, 125)          62625     
_________________________________________________________________
average_pooling1d_3 (Average (None, 106, 125)          0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 106, 125)          0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 105, 125)          31375     
_________________________________________________________________
average_pooling1d_4 (Average (None, 52, 125)   

In [None]:
################################################################################################################################################
#Train the network with 1K simulations per model
################################################################################################################################################
# Requires u1..u5 (the memory-mapped simulation arrays) from the 10K training cell.
sims_per_model = 1000

# Keep only the first `sims_per_model` simulations of each model and combine them.
subsets = [u[0:sims_per_model, :, :] for u in (u1, u2, u3, u4, u5)]
x = np.concatenate(subsets, axis=0)

# Label each simulation with the 0-based index of the model it came from.
y = np.repeat(np.arange(len(subsets)), [len(s) for s in subsets])

print(len(x), len(y))

# Shuffle data and labels with the same permutation.
shf = list(range(len(x)))
shuffle(shf)
y = y[shf]
x = x[shf]

# First 25% of the shuffled data is held out for validation, the remaining 75% trains.
n_val = int(len(y) * .25)
xtrain, xtest = x[n_val:], x[:n_val]
ytrain, ytest = y[n_val:], y[:n_val]
ytest = keras.utils.to_categorical(ytest, num_classes)
ytrain = keras.utils.to_categorical(ytrain, num_classes)

# Create the CNN network
cnn = create_cnn(xtest)

cnn.compile(loss=keras.losses.categorical_crossentropy,
            optimizer=keras.optimizers.Adam(),
            metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
cnn.summary()

# Time the training run.
start = time.time()
cnn.fit(xtrain, ytrain,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        validation_data=(xtest, ytest))
print('Time: ')
print(time.time() - start)

cnn.save(filepath='/content/drive/My Drive/Colab Notebooks/CNN_SpDelimitation_Piloso/Trained_1KSims.acc.mod')

5000 5000
Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 214, 64)           0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 213, 250)          32250     
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 212, 125)          62625     
_________________________________________________________________
average_pooling1d_5 (Average (None, 106, 125)          0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 106, 125)          0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 105, 125)          31375     
_________________________________________________________________
average_pooling1d_6 (Average (None, 52, 125)     

In [None]:
################################################################################################################################################
#Train the network with 500 simulations per model
################################################################################################################################################
# Requires u1..u5 (the memory-mapped simulation arrays) from the 10K training cell.
sims_per_model = 500

# Keep only the first `sims_per_model` simulations of each model and combine them.
subsets = [u[0:sims_per_model, :, :] for u in (u1, u2, u3, u4, u5)]
x = np.concatenate(subsets, axis=0)

# Label each simulation with the 0-based index of the model it came from.
y = np.repeat(np.arange(len(subsets)), [len(s) for s in subsets])

print(len(x), len(y))

# Shuffle data and labels with the same permutation.
shf = list(range(len(x)))
shuffle(shf)
y = y[shf]
x = x[shf]

# First 25% of the shuffled data is held out for validation, the remaining 75% trains.
n_val = int(len(y) * .25)
xtrain, xtest = x[n_val:], x[:n_val]
ytrain, ytest = y[n_val:], y[:n_val]
ytest = keras.utils.to_categorical(ytest, num_classes)
ytrain = keras.utils.to_categorical(ytrain, num_classes)

# Create the CNN network
cnn = create_cnn(xtest)

cnn.compile(loss=keras.losses.categorical_crossentropy,
            optimizer=keras.optimizers.Adam(),
            metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
cnn.summary()

# Time the training run.
start = time.time()
cnn.fit(xtrain, ytrain,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        validation_data=(xtest, ytest))
print('Time: ')
print(time.time() - start)

cnn.save(filepath='/content/drive/My Drive/Colab Notebooks/CNN_SpDelimitation_Piloso/Trained_0.5KSims.acc.mod')

2500 2500
Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 214, 64)           0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 213, 250)          32250     
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 212, 125)          62625     
_________________________________________________________________
average_pooling1d_7 (Average (None, 106, 125)          0         
_________________________________________________________________
dropout_13 (Dropout)         (None, 106, 125)          0         
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 105, 125)          31375     
_________________________________________________________________
average_pooling1d_8 (Average (None, 52, 125)     

In [None]:
################################################################################################################################################
#Evaluate the CNN trained with 2.5K simulations per model, using 1,000 simulations per model as test set.
################################################################################################################################################
from keras.models import load_model
from sklearn.metrics import confusion_matrix

# Test-set location, matching the path used by the 10K evaluation cell
# (".../CNN_SpDelimitation_Piloso/TestData/Piloso").
test_dir = "/content/drive/My Drive/Colab Notebooks/CNN_SpDelimitation_Piloso/TestData/Piloso"
test_sets = []
for k in range(1, 6):
    key = "simModel{}".format(k)
    # np.load on an .npz returns an open archive handle; read the array inside a
    # `with` block so the underlying file is closed again.
    with np.load(os.path.join(test_dir, key + ".npz")) as archive:
        test_sets.append(archive[key])
t1, t2, t3, t4, t5 = test_sets
x = np.concatenate(test_sets, axis=0)

# Label simulations from the test set with the 0-based index of their model.
y = np.repeat(np.arange(len(test_sets)), [len(t) for t in test_sets])

# Load the model from the location where the 2.5K training cell saves it.
model = load_model('/content/drive/My Drive/Colab Notebooks/CNN_SpDelimitation_Piloso/Trained_2.5KSims.acc.mod')
pred = model.predict(x)

# Predicted class = index of the highest softmax output for each simulation.
pred_cat = pred.argmax(axis=1)
cm = confusion_matrix(y, pred_cat)
print(cm)
# Counts divided by the total number of test simulations (the whole matrix sums
# to 1; rows are not normalised individually).
print(cm / float(len(y)))

[[ 966    0   34    0    0]
 [   0  972    0   14   14]
 [   0    0  996    4    0]
 [   0    0   28  914   58]
 [   0    0    0    0 1000]]
[[0.1932 0.     0.0068 0.     0.    ]
 [0.     0.1944 0.     0.0028 0.0028]
 [0.     0.     0.1992 0.0008 0.    ]
 [0.     0.     0.0056 0.1828 0.0116]
 [0.     0.     0.     0.     0.2   ]]


In [None]:
################################################################################################################################################
#Evaluate the CNN trained with 1K simulations per model, using 1,000 simulations per model as test set.
################################################################################################################################################
from keras.models import load_model
from sklearn.metrics import confusion_matrix

# Test-set location, matching the path used by the 10K evaluation cell
# (".../CNN_SpDelimitation_Piloso/TestData/Piloso").
test_dir = "/content/drive/My Drive/Colab Notebooks/CNN_SpDelimitation_Piloso/TestData/Piloso"
test_sets = []
for k in range(1, 6):
    key = "simModel{}".format(k)
    # np.load on an .npz returns an open archive handle; read the array inside a
    # `with` block so the underlying file is closed again.
    with np.load(os.path.join(test_dir, key + ".npz")) as archive:
        test_sets.append(archive[key])
t1, t2, t3, t4, t5 = test_sets
x = np.concatenate(test_sets, axis=0)

# Label simulations from the test set with the 0-based index of their model.
y = np.repeat(np.arange(len(test_sets)), [len(t) for t in test_sets])

# Load the model from the location where the 1K training cell saves it.
model = load_model('/content/drive/My Drive/Colab Notebooks/CNN_SpDelimitation_Piloso/Trained_1KSims.acc.mod')
pred = model.predict(x)

# Predicted class = index of the highest softmax output for each simulation.
pred_cat = pred.argmax(axis=1)
cm = confusion_matrix(y, pred_cat)
print(cm)
# Counts divided by the total number of test simulations (the whole matrix sums
# to 1; rows are not normalised individually).
print(cm / float(len(y)))

[[959   0  41   0   0]
 [  4 967   0  20   9]
 [  0   0 998   2   0]
 [  0   0  43 928  29]
 [  0   1   0   3 996]]
[[0.1918 0.     0.0082 0.     0.    ]
 [0.0008 0.1934 0.     0.004  0.0018]
 [0.     0.     0.1996 0.0004 0.    ]
 [0.     0.     0.0086 0.1856 0.0058]
 [0.     0.0002 0.     0.0006 0.1992]]


In [None]:
################################################################################################################################################
#Evaluate the CNN trained with 500 simulations per model, using 1,000 simulations per model as test set.
################################################################################################################################################
from keras.models import load_model
from sklearn.metrics import confusion_matrix

# Test-set location, matching the path used by the 10K evaluation cell
# (".../CNN_SpDelimitation_Piloso/TestData/Piloso").
test_dir = "/content/drive/My Drive/Colab Notebooks/CNN_SpDelimitation_Piloso/TestData/Piloso"
test_sets = []
for k in range(1, 6):
    key = "simModel{}".format(k)
    # np.load on an .npz returns an open archive handle; read the array inside a
    # `with` block so the underlying file is closed again.
    with np.load(os.path.join(test_dir, key + ".npz")) as archive:
        test_sets.append(archive[key])
t1, t2, t3, t4, t5 = test_sets
x = np.concatenate(test_sets, axis=0)

# Label simulations from the test set with the 0-based index of their model.
y = np.repeat(np.arange(len(test_sets)), [len(t) for t in test_sets])

# Load the model from the location where the 500-simulation training cell saves it.
model = load_model('/content/drive/My Drive/Colab Notebooks/CNN_SpDelimitation_Piloso/Trained_0.5KSims.acc.mod')
pred = model.predict(x)

# Predicted class = index of the highest softmax output for each simulation.
pred_cat = pred.argmax(axis=1)
cm = confusion_matrix(y, pred_cat)
print(cm)
# Counts divided by the total number of test simulations (the whole matrix sums
# to 1; rows are not normalised individually).
print(cm / float(len(y)))

[[962   0  38   0   0]
 [  6 966   0  12  16]
 [  0   0 998   2   0]
 [  0   6  55 869  70]
 [  0   2   0   0 998]]
[[0.1924 0.     0.0076 0.     0.    ]
 [0.0012 0.1932 0.     0.0024 0.0032]
 [0.     0.     0.1996 0.0004 0.    ]
 [0.     0.0012 0.011  0.1738 0.014 ]
 [0.     0.0004 0.     0.     0.1996]]
