# 2. Preparing Data for CNN Training

In [1]:
from glob import glob
from ete3 import Tree
from random import shuffle
import numpy as np

In [5]:
# Node-name pairs whose pairwise distance is measured on every gene tree.
# The order is significant: it fixes the column order of the matrices
# built by process_sim_data.
ordered_pairs = [
    ("1", "2"), ("3", "4"),
    ("1", "3"), ("2", "3"),
    ("1", "4"), ("2", "4"),
    ("1", "5"), ("2", "5"), ("3", "5"), ("4", "5"),
]

In [6]:
def process_sim_data(sim_name, max_gt):
    """Convert one model's simulated gene-tree files into a zero-padded 3-D array.

    Every file matching ``{sim_name}/{sim_name}_recomb*.tre`` is treated as one
    simulation. Each non-blank line of a file is parsed as one gene tree: the
    text up to and including the first ``]`` is dropped (if there is no ``]``
    the whole line is used) and the remainder is passed to ``ete3.Tree``.
    For every tree, half of ``Tree.get_distance`` between the two nodes of
    each pair in ``ordered_pairs`` is recorded (presumably the pairwise
    coalescence time -- confirm against the simulator's output). The resulting
    ``(n_trees, n_pairs)`` matrix is standardized using the mean and standard
    deviation computed over the whole matrix, then written into the first
    ``n_trees`` rows of that simulation's slice; remaining rows stay zero.

    Parameters
    ----------
    sim_name : str
        Directory name, also used as the file-name prefix.
    max_gt : int
        Number of rows reserved per simulation.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_files, max_gt, len(ordered_pairs))``.

    Raises
    ------
    ValueError
        If a file contains more than ``max_gt`` trees.
    """
    # glob order depends on the filesystem; sorting (lexicographically) makes
    # the row order of the result reproducible between runs.
    tree_files = sorted(glob("{0}/{0}_recomb*.tre".format(sim_name)))
    nsims = len(tree_files)
    X = np.zeros((nsims, max_gt, len(ordered_pairs)))
    print("Processing {} trees ({} simulations)...".format(sim_name,nsims))
    for i, path in enumerate(tree_files):
        rows = []
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # tolerate blank / trailing empty lines
                    continue
                # Drop the leading "[...]" prefix when present.
                _, bracket, remainder = line.partition("]")
                newick = remainder if bracket else line
                t = Tree(newick)
                rows.append([t.get_distance(a, b) / 2.0 for a, b in ordered_pairs])
        if len(rows) > max_gt:
            raise ValueError(
                "{} contains {} trees but max_gt is {}".format(path, len(rows), max_gt)
            )
        if not rows:
            # No trees in this file: leave the slice as all zeros instead of
            # taking the mean of an empty array (which would yield NaN).
            continue
        mat = np.array(rows)
        centered = mat - mat.mean()
        mat_std = mat.std()
        # A zero spread would turn the whole slice into NaN; keep the
        # centered values (all zeros) in that degenerate case.
        X[i, :mat.shape[0], :] = centered / mat_std if mat_std > 0 else centered
    return X

Now we will run the simulated data through `process_sim_data` to generate the input images. We can get the maximum number of trees by looking at the line counts in each of the simulated gene tree files using `wc -l *.tre | sort | tail` in each of the folders with simulated data. For these data, the maximum number of gene trees is 394.

In [5]:
# Build the image array for the `no_hyb` simulations (394 rows reserved per
# simulation) and cache it on disk under the key "X".
no_hyb = process_sim_data(sim_name="no_hyb", max_gt=394)
np.savez_compressed('no_hyb_recomb.npz', X=no_hyb)

Processing no_hyb trees (50000 simulations)...


In [6]:
# Build the image array for the `one_hyb` simulations (394 rows reserved per
# simulation) and cache it on disk under the key "X".
one_hyb = process_sim_data(sim_name="one_hyb", max_gt=394)
np.savez_compressed('one_hyb_recomb.npz', X=one_hyb)

Processing one_hyb trees (50000 simulations)...


In [7]:
# Build the image array for the `hyb_mig` simulations (394 rows reserved per
# simulation) and cache it on disk under the key "X".
hyb_mig = process_sim_data(sim_name="hyb_mig", max_gt=394)
np.savez_compressed('hyb_mig_recomb.npz', X=hyb_mig)

Processing hyb_mig trees (50000 simulations)...


In [8]:
# Build the image array for the `two_hyb` simulations (394 rows reserved per
# simulation) and cache it on disk under the key "X".
two_hyb = process_sim_data(sim_name="two_hyb", max_gt=394)
np.savez_compressed('two_hyb_recomb.npz', X=two_hyb)

Processing two_hyb trees (50000 simulations)...


In [2]:
# Load the cached per-model arrays in a fixed class order. The position of a
# file in this tuple is the index of its one-hot label column.
all_model_files = (
    'no_hyb_recomb.npz',
    'one_hyb_recomb.npz',
    'hyb_mig_recomb.npz',
    'two_hyb_recomb.npz',
)
all_model_arrays = []
for npz_path in all_model_files:
    # np.load on an .npz returns a lazily-read archive; the context manager
    # closes the underlying file once "X" has been read into memory.
    with np.load(npz_path) as npz:
        all_model_arrays.append(npz["X"])

# Take the label counts from the data itself so the labels stay aligned with
# the rows even if a model does not have the same number of simulations.
all_class_sizes = [arr.shape[0] for arr in all_model_arrays]
X_all = np.concatenate(all_model_arrays, axis=0)
del all_model_arrays  # release the per-model copies; X_all holds the data now
print(X_all.shape)

# One-hot labels: row k of the identity matrix is repeated once per
# simulation of model k, giving shape (n_total, n_models).
y_all = np.repeat(np.eye(len(all_class_sizes), dtype=int), all_class_sizes, axis=0)

(200000, 394, 10)


Finally, we will shuffle the indices to randomize the order of the different simulated models and divide
everything into a training, validation, and test data set.

In [4]:
# Fixed seed so the train/validation/test membership is reproducible.
ALL_SIMS_SHUFFLE_SEED = 42

n_all = X_all.shape[0]
assert y_all.shape[0] == n_all, "X_all and y_all must have the same number of rows"

# One shared permutation keeps every image paired with its label.
shf = np.random.RandomState(ALL_SIMS_SHUFFLE_SEED).permutation(n_all)
X_shf = X_all[shf,:,:]
y_shf = y_all[shf,:]

# 70% / 15% / 15% split, computed with integer arithmetic
# (140000 / 30000 / 30000 rows when n_all is 200000).
all_train_end = n_all * 70 // 100
all_val_end = n_all * 85 // 100
xtrain, xval, xtest = X_shf[:all_train_end,:,:], X_shf[all_train_end:all_val_end,:,:], X_shf[all_val_end:,:,:]
ytrain, yval, ytest = y_shf[:all_train_end,:], y_shf[all_train_end:all_val_end,:], y_shf[all_val_end:,:]
np.savez_compressed(
    'all_sims_recomb.npz',
    xtrain=xtrain,
    xval=xval,
    xtest=xtest,
    ytrain=ytrain,
    yval=yval,
    ytest=ytest
)

In [5]:
# Load the cached arrays for the three-model data set in a fixed class order.
# The position of a file in this tuple is the index of its one-hot label column.
hyb_model_files = (
    'no_hyb_recomb.npz',
    'one_hyb_recomb.npz',
    'two_hyb_recomb.npz',
)
hyb_model_arrays = []
for npz_path in hyb_model_files:
    # The context manager closes the .npz archive once "X" has been read.
    with np.load(npz_path) as npz:
        hyb_model_arrays.append(npz["X"])

# Take the label counts from the data itself so the labels stay aligned with
# the rows even if a model does not have the same number of simulations.
hyb_class_sizes = [arr.shape[0] for arr in hyb_model_arrays]
X_hyb = np.concatenate(hyb_model_arrays, axis=0)
del hyb_model_arrays  # release the per-model copies; X_hyb holds the data now
print(X_hyb.shape)

# One-hot labels: row k of the identity matrix is repeated once per
# simulation of model k, giving shape (n_total, n_models).
y_hyb = np.repeat(np.eye(len(hyb_class_sizes), dtype=int), hyb_class_sizes, axis=0)

(150000, 394, 10)


In [6]:
# Fixed seed so the train/validation/test membership is reproducible.
HYB_SIMS_SHUFFLE_SEED = 42

n_hyb = X_hyb.shape[0]
assert y_hyb.shape[0] == n_hyb, "X_hyb and y_hyb must have the same number of rows"

# One shared permutation keeps every image paired with its label.
hshf = np.random.RandomState(HYB_SIMS_SHUFFLE_SEED).permutation(n_hyb)
X_shyb = X_hyb[hshf,:,:]
y_shyb = y_hyb[hshf,:]

# 70% / 15% / 15% split, computed with integer arithmetic
# (105000 / 22500 / 22500 rows when n_hyb is 150000).
hyb_train_end = n_hyb * 70 // 100
hyb_val_end = n_hyb * 85 // 100
np.savez_compressed(
    'hyb_sims_recomb.npz',
    xtrain=X_shyb[:hyb_train_end,:,:],
    xval=X_shyb[hyb_train_end:hyb_val_end,:,:],
    xtest=X_shyb[hyb_val_end:,:,:],
    ytrain=y_shyb[:hyb_train_end,:],
    yval=y_shyb[hyb_train_end:hyb_val_end,:],
    ytest=y_shyb[hyb_val_end:,:]
)

Let's also create a gzipped tarball of the simulated gene trees since we are done processing them:

In [15]:
%%bash
# Create <model>/<model>_recomb.tar.gz from the simulated gene-tree files of
# each model directory.
# Stop at the first failure so a broken archive is reported by the cell
# instead of being silently skipped.
set -e
for model in no_hyb one_hyb hyb_mig two_hyb; do
    # The subshell keeps the directory change local: even if tar fails, the
    # next iteration still starts from the notebook's working directory.
    (cd "$model" && tar czf "${model}_recomb.tar.gz" "${model}"_recomb*.tre)
done