In [1]:
import pandas as pd
import pyslim
import tskit
import numpy as np

# Parallelizing SLiM simulations in a phylogenetic tree

In [2]:
# Tab-separated table describing the phylogeny; read with pd.read_csv further down.
path_to_tsv = "phylo.tsv"
# Makefile that this notebook generates, one rule per row of the table.
path_to_make = "parallel_sims.make"
# SLiM script that every Makefile rule invokes, and that every rule depends on.
path_to_slimscript = "simulate_branch.slim"
# NOTE(review): not referenced in the cells visible here -- presumably the
# destination for the final unioned tree sequence; confirm against later cells.
path_to_unioned = "unioned.trees"

## Simulating the branches

### Creating the Makefile to run simulations

In the paper, we just wrote down the Makefile.
For a more complicated phylogeny we might want to generate it programmatically.
Here's a way to do it, using the data in this tsv file:

In [3]:
! cat phylo.tsv

child	parent	popsize	edgelen
root		500	2000
C	root	50	250
I	root	100	200
B	I	70	50
A	I	40	50


In [4]:
# Reading the phylogeny data frame
# (the root row has no parent, so its NaN is replaced by an empty string)
df = pd.read_csv(path_to_tsv, sep="\t")
df = df.fillna('')

## Creating intermediate tree sequences filenames
df["infile"] = df.parent + ".trees"
df["outfile"] = df.child + ".trees"
df.loc[df["infile"]==".trees", "infile"] = ""  # the root starts from scratch: no input file
df["is_leaf"] = ~df.child.isin(df.parent) # setting nodes that are never parents as leaves

# Writing a Makefile
# Each branch becomes one rule whose prerequisite is its parent's output, so
# `make` can work out by itself which branches may be simulated concurrently.
targets = ' '.join(df.outfile.to_list())
# `with` guarantees the file is flushed and closed even if a write fails.
with open(path_to_make, "w") as f:
    print(f"all: {targets}\n", file=f)
    print("clean:", file=f)
    print(f"\t-rm {targets}\n", file=f)
    for row in df.itertuples(index=False):
        print(f"{row.outfile}: {row.infile} {path_to_slimscript}", file=f)
        # The rule target and the `outfile` constant handed to SLiM both come
        # from the same column, so they cannot drift apart.
        print(f"\tslim -s 123 -d \"infile='{row.infile}'\" -d popsize={row.popsize} "
              f"-d \"popname='{row.child}'\" "
              f"-d num_gens={row.edgelen} -d \"outfile='{row.outfile}'\" "
              f"{path_to_slimscript}\n",
              file=f)

We can verify that the result is very similar to what we have in the paper, although it takes a different strategy
(it does not include merging):

In [5]:
! cat parallel_sims.make

all: root.trees C.trees I.trees B.trees A.trees

clean:
	-rm root.trees C.trees I.trees B.trees A.trees

root.trees:  simulate_branch.slim
	slim -s 123 -d "infile=''" -d popsize=500 -d "popname='root'" -d num_gens=2000 -d "outfile='root.trees'" simulate_branch.slim

C.trees: root.trees simulate_branch.slim
	slim -s 123 -d "infile='root.trees'" -d popsize=50 -d "popname='C'" -d num_gens=250 -d "outfile='C.trees'" simulate_branch.slim

I.trees: root.trees simulate_branch.slim
	slim -s 123 -d "infile='root.trees'" -d popsize=100 -d "popname='I'" -d num_gens=200 -d "outfile='I.trees'" simulate_branch.slim

B.trees: I.trees simulate_branch.slim
	slim -s 123 -d "infile='I.trees'" -d popsize=70 -d "popname='B'" -d num_gens=50 -d "outfile='B.trees'" simulate_branch.slim

A.trees: I.trees simulate_branch.slim
	slim -s 123 -d "infile='I.trees'" -d popsize=40 -d "popname='A'" -d num_gens=50 -d "outfile='A.trees'" simulate_branch.slim



### Running the simulations in parallel

Now, `make` will magically run things in parallel if possible:

In [6]:
%%bash
# Remove any previously generated .trees files so every branch is re-simulated.
make -f parallel_sims.make clean
# -j 3 lets make run up to three recipes at once; rules that share a finished
# prerequisite (e.g. C.trees and I.trees, which both need only root.trees)
# can therefore be simulated concurrently.
make -f parallel_sims.make -j 3

rm root.trees C.trees I.trees B.trees A.trees
slim -s 123 -d "infile=''" -d popsize=500 -d "popname='root'" -d num_gens=2000 -d "outfile='root.trees'" simulate_branch.slim
// Initial random seed:
123

// RunInitializeCallbacks():
initializeSLiMModelType(modelType = 'WF');
initializeTreeSeq();
initializeMutationRate(1e-08);
initializeMutationType(1, 0.5, "f", -0.01);
initializeGenomicElementType(1, m1, 0.1);
initializeGenomicElement(g1, 0, 999999);
initializeRecombinationRate(1e-09);

// Starting run at tick <start>:
1 

slim -s 123 -d "infile='root.trees'" -d popsize=50 -d "popname='C'" -d num_gens=250 -d "outfile='C.trees'" simulate_branch.slim
slim -s 123 -d "infile='root.trees'" -d popsize=100 -d "popname='I'" -d num_gens=200 -d "outfile='I.trees'" simulate_branch.slim
// Initial random seed:
123

// RunInitializeCallbacks():
initializeSLiMModelType(modelType = 'WF');
initializeTreeSeq();
initializeMutationRate(1e-08);
initializeMutationType(1, 0.5, "f", -0.01);
initializeGenomicEle

In [7]:
# List the working directory to confirm that one .trees file per branch was written.
! ls

A.trees		I.trees		     phylo.pdf		   union.py
B.trees		parallelizing.ipynb  phylo.tsv
create_make.py	parallel_sims.make   root.trees
C.trees		phylo.csv	     simulate_branch.slim


## Putting it all together: unioning the tree sequences

Next, we put them back together.

In [8]:
def match_nodes(other, ts, split_time):
    """
    Build the node map used to union `other` into `ts`.

    Nodes are paired up through the "slim_id" entry of their metadata, and
    only nodes of `other` whose time is at least `split_time` are considered.

    Returns an integer array of length `other.num_nodes`: entry `i` is the id
    of the node in `ts` that has the same "slim_id" as node `i` of `other`,
    or -1 if node `i` is younger than `split_time` or has no counterpart.
    """
    node_mapping = np.full(other.num_nodes, -1, dtype='int')
    slim_ids_ts = np.array([node.metadata["slim_id"] for node in ts.nodes()])
    slim_ids_other = np.array([node.metadata["slim_id"] for node in other.nodes()])

    # A node of `other` gets mapped only if it is old enough AND its id occurs in `ts`.
    old_enough = other.tables.nodes.time >= split_time
    present_in_ts = np.isin(slim_ids_other, slim_ids_ts)
    to_map = old_enough & present_in_ts

    # searchsorted needs sorted input, so search through the argsort permutation
    # and translate the resulting positions back into node ids of `ts`.
    order = np.argsort(slim_ids_ts)
    positions = np.searchsorted(
        slim_ids_ts, slim_ids_other[to_map], side='left', sorter=order
    )
    node_mapping[to_map] = order[positions]
    return node_mapping

In [9]:
def union_children(parent, df, merged):
    """
    Recursively union the tree sequences of the two children of `parent`.

    Parameters
    ----------
    parent : name of a node, as it appears in the `child` / `parent` columns of `df`.
    df : data frame with (at least) the columns `child`, `parent` and `edgelen`.
    merged : dict mapping a node name to a dict with the keys
        "ts" (tree sequence), "depth" (time from the tips to the start of that
        node's branch) and "children" (names of the tips below it).
        It is modified in place: on return it has an entry for `parent` if
        `parent` has two children. Nodes without children are left untouched,
        so they must already be in `merged` before their parent is processed.

    Raises
    ------
    AssertionError
        If `parent` does not have exactly zero or two children, or if its two
        children are not at the same depth (the tree must be ultrametric).
    """
    print(f"Going in: {parent}")
    child_rows = df[df.parent == parent]
    # This used to test `len(childs)`, an undefined name, so every node
    # without exactly two children died with a NameError.
    assert len(child_rows) in (0, 2), (
        f"{parent} has {len(child_rows)} children; expected 0 or 2"
    )
    if len(child_rows) == 2:
        children = child_rows["child"].tolist()
        # Make sure both subtrees have been merged before joining them.
        for child in children:
            if child not in merged:
                union_children(child, df, merged)
        split_time = merged[children[0]]["depth"]
        assert split_time == merged[children[1]]["depth"], (
            f"{children} are at different depths: the tree is not ultrametric"
        )
        print(f'Unioning: {children}, Split time: {split_time}')
        ts0 = merged[children[0]]["ts"]
        ts1 = merged[children[1]]["ts"]
        node_map = match_nodes(ts1, ts0, split_time)
        tsu = ts0.union(ts1, node_map, check_shared_equality=True)
        # the time from tip to start of simulation is split_time plus the
        # length of the edge
        parent_edgelength = df[df.child==parent].edgelen.item()
        merged[parent] = {
            "ts": tsu,
            "depth": split_time + parent_edgelength,
            "children": merged[children[0]]["children"] + merged[children[1]]["children"]
        }

In [10]:
pops = ['A', 'B', 'C']
A, B, C = (tskit.load(f"{x}.trees") for x in pops)

# Split times are measured in ticks before the present, i.e. the summed branch
# lengths from a tip back to the point where the two lineages separated.
edgelen = df.set_index("child")["edgelen"]
split_AB = edgelen["A"]                   # A and B separated at the end of branch I
split_ABC = edgelen["A"] + edgelen["I"]   # C separated from I at the end of the root branch
# The tree must be ultrametric for a single split time to apply to both sides.
assert edgelen["B"] == split_AB
assert edgelen["C"] == split_ABC

nm = match_nodes(B, A, split_time=split_AB)
AB = A.union(B, nm)
# This used to pass split_time=200, which is only the length of branch I.
# Nodes between 200 and 250 ticks old were created independently in C and in I
# after the split, so they are not shared history and must not be matched
# (their slim_ids may coincide by accident).
nm = match_nodes(C, AB, split_time=split_ABC)
ABC = AB.union(C, nm)

In [11]:
# Nodes at time 0 are the ones still alive when the simulations ended.
alive = np.flatnonzero(np.isclose(ABC.tables.nodes.time, 0))

# Map each population's name (from its metadata) to its id in the unioned tree sequence.
pop_ids = {}
for pop in ABC.populations():
    if pop.metadata is None:
        continue
    pop_ids[pop.metadata['name']] = pop.id

# Every tip population should hold exactly the number of individuals it was
# simulated with; each individual contributes two sample nodes, hence the // 2.
for name in pops:
    expected = df[df.child==name].popsize.item()
    pop_samples = ABC.samples(pop_ids[name])
    n_samples = np.isin(pop_samples, alive).sum() // 2
    print(f"Union-ed tree sequence has {n_samples} samples in population {name},\n"
          f"\tand we specified {expected} individuals in our simulations.")
    assert n_samples == expected

Union-ed tree sequence has 40 samples in population A,
	and we specified 40 individuals in our simulations.
Union-ed tree sequence has 70 samples in population B,
	and we specified 70 individuals in our simulations.
Union-ed tree sequence has 50 samples in population C,
	and we specified 50 individuals in our simulations.


The final result is `ABC`:

In [12]:
# Bare last expression: Jupyter renders the summary of the unioned tree sequence.
ABC

Tree Sequence,Unnamed: 1
Trees,11
Sequence Length,1000000.0
Time Units,ticks
Sample Nodes,1520
Total Size,359.2 KiB
Metadata,dict  SLiM:  dict  chromosomes:  list  dict  id: 1 index: 0 symbol: A type: A  cycle: 2251 file_version: 0.9 model_type: WF name: sim nucleotide_based: False separate_sexes: False spatial_dimensionality: spatial_periodicity: stage: late  this_chromosome:  dict  id: 1 index: 0 symbol: A type: A  tick: 2251

Table,Rows,Size,Has Metadata
Edges,2810,87.8 KiB,
Individuals,760,76.0 KiB,✅
Migrations,0,8 Bytes,
Mutations,143,9.3 KiB,✅
Nodes,2783,102.2 KiB,✅
Populations,6,2.5 KiB,✅
Provenances,5,52.1 KiB,
Sites,143,3.4 KiB,

Provenance Timestamp,Software Name,Version,Command,Full record
"21 July, 2025 at 09:44:42 AM",tskit,0.6.2,union,"Details  dict  schema_version: 1.0.0  software:  dict  name: tskit version: 0.6.2  parameters:  dict  command: union  other:  dict  timestamp:  list  2025-07-21T09:44:42  2025-07-21T09:44:42  record:  list  { ""environment"": { ""os"": { ""machine"": ""x86_64"", ""node"": ""grebe"",...  { ""environment"": { ""os"": { ""machine"": ""x86_64"", ""node"": ""grebe"",...  node_mapping:  list  0  1  2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29 ... and 1992 more  environment:  dict  os:  dict  system: Linux node: grebe release: 6.12.27-amd64 version: #1 SMP PREEMPT_DYNAMIC Debian 6.12.27-1 (2025-05-06) machine: x86_64  python:  dict  implementation: CPython version: 3.13.5  libraries:  dict  kastore:  dict  version: 2.1.1"
"21 July, 2025 at 09:44:42 AM",tskit,0.6.2,union,"Details  dict  schema_version: 1.0.0  software:  dict  name: tskit version: 0.6.2  parameters:  dict  command: union  other:  dict  timestamp:  list  2025-07-21T09:44:42  2025-07-21T09:44:42  2025-07-21T09:44:42  record:  list  { ""environment"": { ""os"": { ""machine"": ""x86_64"", ""node"": ""grebe"",...  { ""environment"": { ""os"": { ""machine"": ""x86_64"", ""node"": ""grebe"",...  { ""environment"": { ""os"": { ""machine"": ""x86_64"", ""node"": ""grebe"",...  node_mapping:  list  0  1  2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29 ... and 2424 more  environment:  dict  os:  dict  system: Linux node: grebe release: 6.12.27-amd64 version: #1 SMP PREEMPT_DYNAMIC Debian 6.12.27-1 (2025-05-06) machine: x86_64  python:  dict  implementation: CPython version: 3.13.5  libraries:  dict  kastore:  dict  version: 2.1.1"
"21 July, 2025 at 09:44:42 AM",SLiM,5.0,"['slim', '-s', '123', '-d', ""infile='I.trees'"", '-d', 'popsize=40', '-d', ""popname='A'"", '-d', 'num_gens=50', '-d', ""outfile='A.trees'"", 'simulate_branch.slim']",Details  dict  environment:  dict  os:  dict  machine: x86_64 node: grebe release: 6.12.27-amd64 system: Linux version: #1 SMP PREEMPT_DYNAMIC Debian 6.12.27-1 (2025-05-06)  metadata:  dict  individuals:  dict  flags:  dict  16:  dict  description: the individual was alive at the time the file was written name: SLIM_TSK_INDIVIDUAL_ALIVE  17:  dict  description: the individual was requested by the user to be permanently remembered name: SLIM_TSK_INDIVIDUAL_REMEMBERED  18:  dict  description: the individual was requested by the user to be retained only if its nodes continue to exist in the t... name: SLIM_TSK_INDIVIDUAL_RETAINED  parameters:  dict  command:  list  slim  -s  123  -d  infile='I.trees'  -d  popsize=40  -d  popname='A'  -d  num_gens=50  -d  outfile='A.trees'  simulate_branch.slim  model: // The following constants need to be defined: // - outfile: path to save the resulting tree sequenc... model_hash: d4946ee6382447931c8c05ede4b2cd d0f4a2d5e33033e233066d0f8b42d7 bc6f model_type: WF nucleotide_based: False seed: 123 separate_sexes: False spatial_dimensionality: spatial_periodicity: stage: late  resources:  dict  elapsed_time: 0.015871433 max_memory: 13848576 sys_time: 0.003535 user_time: 0.014143  schema_version: 1.1.0  slim:  dict  cycle: 2251 file_version: 0.9 name: sim tick: 2251  software:  dict  name: SLiM version: 5.0
"21 July, 2025 at 09:44:42 AM",SLiM,5.0,"['slim', '-s', '123', '-d', ""infile='root.trees'"", '-d', 'popsize=100', '-d', ""popname='I'"", '-d', 'num_gens=200', '-d', ""outfile='I.trees'"", 'simulate_branch.slim']",Details  dict  environment:  dict  os:  dict  machine: x86_64 node: grebe release: 6.12.27-amd64 system: Linux version: #1 SMP PREEMPT_DYNAMIC Debian 6.12.27-1 (2025-05-06)  metadata:  dict  individuals:  dict  flags:  dict  16:  dict  description: the individual was alive at the time the file was written name: SLIM_TSK_INDIVIDUAL_ALIVE  17:  dict  description: the individual was requested by the user to be permanently remembered name: SLIM_TSK_INDIVIDUAL_REMEMBERED  18:  dict  description: the individual was requested by the user to be retained only if its nodes continue to exist in the t... name: SLIM_TSK_INDIVIDUAL_RETAINED  parameters:  dict  command:  list  slim  -s  123  -d  infile='root.trees'  -d  popsize=100  -d  popname='I'  -d  num_gens=200  -d  outfile='I.trees'  simulate_branch.slim  model: // The following constants need to be defined: // - outfile: path to save the resulting tree sequenc... model_hash: 8b284195bccaf36929ced825db1444 1b54f455d4cd211af92a9680608bca f080 model_type: WF nucleotide_based: False seed: 123 separate_sexes: False spatial_dimensionality: spatial_periodicity: stage: late  resources:  dict  elapsed_time: 0.041796754 max_memory: 15736832 sys_time: 0.013091 user_time: 0.030547  schema_version: 1.1.0  slim:  dict  cycle: 2201 file_version: 0.9 name: sim tick: 2201  software:  dict  name: SLiM version: 5.0
"21 July, 2025 at 09:44:42 AM",SLiM,5.0,"['slim', '-s', '123', '-d', ""infile=''"", '-d', 'popsize=500', '-d', ""popname='root'"", '-d', 'num_gens=2000', '-d', ""outfile='root.trees'"", 'simulate_branch.slim']",Details  dict  environment:  dict  os:  dict  machine: x86_64 node: grebe release: 6.12.27-amd64 system: Linux version: #1 SMP PREEMPT_DYNAMIC Debian 6.12.27-1 (2025-05-06)  metadata:  dict  individuals:  dict  flags:  dict  16:  dict  description: the individual was alive at the time the file was written name: SLIM_TSK_INDIVIDUAL_ALIVE  17:  dict  description: the individual was requested by the user to be permanently remembered name: SLIM_TSK_INDIVIDUAL_REMEMBERED  18:  dict  description: the individual was requested by the user to be retained only if its nodes continue to exist in the t... name: SLIM_TSK_INDIVIDUAL_RETAINED  parameters:  dict  command:  list  slim  -s  123  -d  infile=''  -d  popsize=500  -d  popname='root'  -d  num_gens=2000  -d  outfile='root.trees'  simulate_branch.slim  model: // The following constants need to be defined: // - outfile: path to save the resulting tree sequenc... model_hash: defce4309a3e2dc5e9940a8390f505 f1ecac99b9d5e52a707d0311a772fc 1eb8 model_type: WF nucleotide_based: False seed: 123 separate_sexes: False spatial_dimensionality: spatial_periodicity: stage: late  resources:  dict  elapsed_time: 0.670357372 max_memory: 17022976 sys_time: 0.00398 user_time: 0.668801  schema_version: 1.1.0  slim:  dict  cycle: 2001 file_version: 0.9 name: sim tick: 2001  software:  dict  name: SLiM version: 5.0
