# **Script and outputs for training a CNN to perform parameter estimation in the species *Euphorbia segueriana* (scripts for the remaining species are similar).**
From the manuscript Kirschner & Perez et al. "Congruent evolutionary responses of European steppe biota to late Quaternary climate change: insights from convolutional neural network-based demographic modeling".

In [None]:
# Import all required modules.
import sys, os
import numpy as np
import keras
import random
from keras.models import Model
from keras.regularizers import l2
from keras.constraints import max_norm
from keras.layers import Input, Dense, Dropout, Flatten, BatchNormalization
from keras.layers.merge import concatenate
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.pooling import AveragePooling1D, AveragePooling2D
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras import backend as K
from keras.models import load_model
import time
from random import shuffle, choice

# Define parameters for the CNN run.
# Batch size handed to cnn.fit() in the training cell.
batch_size = 500
# Upper bound on the number of training epochs handed to cnn.fit(); the
# EarlyStopping callback used in the training cell may end training sooner.
epochs = 500

# Define a function to read the parameters file.
def readDemogParams(demogParamPath):
    """Read a whitespace-delimited table of demographic parameters.

    Every non-blank line of the file becomes one row of floats.

    Args:
        demogParamPath: Path to the text file holding the parameters.

    Returns:
        A list of rows (each a list of floats), in file order.

    Raises:
        ValueError: If a field cannot be parsed as a float.
    """
    params = []
    with open(demogParamPath) as demogParamFile:
        for line in demogParamFile:
            fields = line.split()
            # Skip blank lines (e.g. an extra newline at the end of the file):
            # they would otherwise add an empty row and make the table ragged.
            if not fields:
                continue
            params.append([float(x) for x in fields])
    return params
	
# Define the CNN architecture.
def create_cnn(xtest):
    """Build the (uncompiled) 1D convolutional network for parameter estimation.

    The input shape is read from the module-level `imgRows` / `imgCols` and
    the width of the output layer from the module-level `numParams`, so all
    three must be defined before this function is called.

    Args:
        xtest: Array whose second and third dimensions are passed as
            `input_shape` to the first convolutional layer.

    Returns:
        A Keras `Model` mapping the input tensor to a `Dense(numParams)`
        output layer (no activation specified).
    """
    # Kernel sizes of the stacked intermediate convolutions.
    kernel_sizes = [3, 5, 20, 50]

    inputs = Input(shape=(imgRows, imgCols))

    # Entry convolution followed by batch normalization.
    net = Conv1D(250, kernel_size=2, activation='relu',
                 input_shape=(xtest.shape[1], xtest.shape[2]))(inputs)
    net = BatchNormalization()(net)

    # One 20-filter convolution per kernel size, each followed by batch normalization.
    for size in kernel_sizes:
        net = Conv1D(20, kernel_size=size, activation='relu',
                     kernel_constraint=max_norm(3), bias_constraint=max_norm(3))(net)
        net = BatchNormalization()(net)

    # Two further convolutions, then average pooling and flattening.
    net = Conv1D(125, kernel_size=2, activation='relu',
                 kernel_constraint=max_norm(3), bias_constraint=max_norm(3))(net)
    net = BatchNormalization()(net)
    net = Conv1D(125, kernel_size=2, activation='relu',
                 kernel_constraint=max_norm(3), bias_constraint=max_norm(3))(net)
    net = AveragePooling1D(pool_size=2)(net)
    net = Flatten()(net)

    # Two L2-regularized dense layers, each followed by 50% dropout.
    net = Dense(125, activation='relu',
                kernel_regularizer=l2(1e-3), bias_regularizer=l2(1e-3))(net)
    net = Dropout(0.5)(net)
    net = Dense(125, activation='relu',
                kernel_regularizer=l2(1e-3), bias_regularizer=l2(1e-3))(net)
    net = Dropout(0.5)(net)

    # Output layer: one unit per parameter to estimate.
    outputs = Dense(numParams)(net)

    # Assemble and return the network.
    return Model(inputs, outputs)

## **Train the network with 10,000 simulations from the selected model**
Use the simulated dataset to train the network, splitting the data into 75% of the simulations for training and 25% for validation.

In [None]:
# Load the NumPy array containing the simulations.
x = np.load("./trainingSims/simModel3.npy",mmap_mode='r')
# Copy the read-only memory-mapped array into memory so it can be modified below.
x = np.array(x)
# The unpacking requires x to be 3-dimensional; create_cnn() reads these globals.
imgRows, imgCols = x.shape[1:]

# Load parameters (one row per simulation).
demogParams = readDemogParams('parameters3.txt')
y = np.array(demogParams)
# Number of parameters to estimate; create_cnn() reads this global.
numParams=y.shape[1]

# Delete temporary objects to free memory.
del (demogParams)

# Print label and simulations length, these should be the same.
print (len(x), len(y))

# Shuffle the arrays for training, applying the same permutation to
# simulations and labels so that they stay paired.
shf = list(range(len(x)))
shuffle(shf)
y = y[shf]
x = x[shf]

# Convert the reference allele to -1.
x[x == 0] = -1

# Add missing data (coded as 0s) to the simulated matrices (with a percentage according to the empirical data - 15% in E. segueriana).
# Cells are drawn with replacement, so the same cell can be picked more than once.
missD = int(x.shape[1]*x.shape[2]*.15)
for i in range(x.shape[0]):
  for m in range(missD):
    j = random.randint(0, x.shape[1] - 1)
    k = random.randint(0, x.shape[2] - 1)
    x[i][j][k] = 0
del(missD)

# Standardize parameters (per column) before training.
yMeans=np.mean(y, axis=0)
yStds=np.std(y, axis=0)
y = (y-yMeans)/yStds

# Print parameters means and std deviations.
print (yMeans)
print (yStds)

# Separate train (75%) and validate (25%) sets: the first 25% of the
# shuffled data is held out for validation, the rest is used for training.
nVal = int(len(y)*.25)
xtrain, xtest = x[nVal:], x[:nVal]
ytrain, ytest = y[nVal:], y[:nVal]
del(x)

# Create the CNN network.
cnn = create_cnn(xtest)

# Compile the CNN.
cnn.compile(loss='mean_squared_error',
	              optimizer='Adam')

# Check the architecture.
cnn.summary()

# Run the CNN with early stopping on the validation loss. The loss must be
# minimized, so mode='min' is required: with mode='max' an increasing val_loss
# would count as an improvement and restore_best_weights would bring back the
# weights with the worst validation loss.
earlyStopping = EarlyStopping(monitor='val_loss', patience=150, verbose=0, mode='min', restore_best_weights=True)

cnn.fit(xtrain, ytrain, batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(xtest, ytest),callbacks=[earlyStopping])

# Save the model architecture as JSON.
# NOTE(review): to_json() serializes the architecture only; the trained weights
# are not written here - confirm whether they also need to be saved.
with open('Trained_Params_10KSims.acc.mod', "w+") as modFile:
    modFile.write(cnn.to_json())

30000 30000
[2.86208248e-01 1.34735301e+06 6.16353922e+04 5.97829560e+03 1.20255566e+05 2.61763230e+00 5.27521489e+01 7.49064630e-01 2.54971359e+00 2.51017993e+00 2.52481896e+00 2.49583994e+00]
[1.09124240e-01 6.05808853e+05 2.81277033e+04 3.48151789e+03 4.58505224e+04 1.36959349e+00 2.73631598e+01 1.45081116e-01 1.43927692e+00 1.44919707e+00 1.43801672e+00 1.44210643e+00]
Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1000, 270)]       0         
_________________________________________________________________
conv1d (Conv1D)              (None, 999, 250)          135250    
_________________________________________________________________
batch_normalization (BatchNo (None, 999, 250)          1000      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 997, 20)           15020     
________

## **Perform parameter predictions with 10,000 simulations from the test set.**

In [None]:
# Read the test simulations, keeping indices 0-999 along the second axis,
# and copy them into memory so they can be modified.
x_test = np.array(np.load("./testSims/simModel3.npy", mmap_mode='r')[:, 0:1000, :])

# Recode the reference allele (0) as -1.
x_test[x_test == 0] = -1

# Mask randomly chosen cells as missing data (coded as 0), drawing a number of
# cells equal to 15% of each matrix, the percentage observed in the empirical
# data for E. segueriana. Cells are drawn with replacement.
missD = int(x_test.shape[1]*x_test.shape[2]*.15)
for sim in x_test:
    # `sim` is a view into x_test, so the assignment below edits x_test in place.
    for _ in range(missD):
        row = random.randint(0, x_test.shape[1] - 1)
        col = random.randint(0, x_test.shape[2] - 1)
        sim[row, col] = 0
del(missD)

# Run the trained network on every test simulation.
pred = cnn.predict(x_test)

# Write the predictions to disk.
np.savetxt("testSet_ParameterPredictions.txt", pred)

## **Predict parameters using the empirical data and the trained CNN.**

In [None]:
# Read the empirical data matrix.
inp = np.array(np.loadtxt("input_Esegueriana.txt"))

# Build 100 subsets, each made of 1,000 rows (SNPs) drawn without replacement
# from the full empirical data.
num_samples = 100
res = [
    np.array(inp[np.random.choice(inp.shape[0], 1000, replace=False), :])
    for _ in range(num_samples)
]

# Stack the subsets into one array and predict parameters for each of them.
Emp_pred = cnn.predict(np.array(res))
print(Emp_pred)

np.savetxt("Emp_ParametersPredictions.txt", Emp_pred)