In [None]:
# COLAB users: this will allow you to run msprime in google colab
# Steps: install condacolab with pip, bootstrap conda via condacolab.install(),
# install msprime from the conda-forge channel, then pip-install tskit and demesdraw.
# NOTE(review): condacolab.install() reportedly restarts the Colab kernel; if the
# lines after it do not run on the first attempt, re-run this cell -- TODO confirm.
!pip install -q condacolab
import condacolab
condacolab.install()
!conda install -c conda-forge msprime
!pip install tskit
!pip install demesdraw

In [None]:
# COLAB users: mount Google Drive at /content/drive/ so that files stored on
# Drive can be accessed from this notebook (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive/')

In [1]:
# Task: decide on prior distributions and generate simulations
import os
import csv
import msprime
import tskit
import numpy as np
import pandas as pd

In [17]:
# Function for simulating data under an isolation-with-migration (IM) model.
# Parameters read from `params`: N_anc, T_split, N1, N2, mig (plus mut and length)

def im(params, sample_sizes, seed, reco):
    """Simulate two populations that split from a common ancestor, with one migration pulse.

    Going backwards in time, the model is:

    * time 0 to ``T_split``: population 0 has size ``N1`` and population 1 has
      size ``N2``;
    * at ``T_split / 2``: a single mass-migration pulse moves a fraction
      ``abs(mig)`` of lineages from population 1 to population 0 when
      ``mig >= 0``, or from population 0 to population 1 when ``mig < 0``;
    * at ``T_split``: all lineages of population 1 are moved into population 0,
      whose size changes to ``N_anc``.

    Parameters
    ----------
    params : mapping
        Must provide non-None values for ``"N1"``, ``"N2"``, ``"T_split"``,
        ``"N_anc"`` and ``"mig"``. ``"mut"`` and ``"length"`` are forwarded
        unchanged to ``msprime.simulate`` as ``mutation_rate`` and ``length``
        (``None`` if absent).
    sample_sizes : sequence of length 2
        Number of samples for population 0 and population 1.
    seed : int or None
        Forwarded to ``msprime.simulate`` as ``random_seed``.
    reco : float
        Forwarded to ``msprime.simulate`` as ``recombination_rate``.

    Returns
    -------
    The object returned by ``msprime.simulate`` (a tree sequence).

    Raises
    ------
    AssertionError
        If ``sample_sizes`` does not have exactly two entries.
    ValueError
        If a required demographic parameter is missing/None, or if
        ``abs(mig)`` is larger than 1 (it is used as a proportion).
    """
    assert len(sample_sizes) == 2

    # Fail early with a clear message: a missing key would otherwise reach
    # msprime as None and either crash obscurely or silently change the model.
    required = ("N1", "N2", "T_split", "N_anc", "mig")
    missing = [key for key in required if params.get(key) is None]
    if missing:
        raise ValueError(
            "im(): missing required parameter(s): " + ", ".join(missing)
        )

    # Extract parameters
    N1 = params.get("N1")
    N2 = params.get("N2")
    T_split = params.get("T_split")
    N_anc = params.get("N_anc")
    mig = params.get("mig")

    # abs(mig) is the fraction of lineages moved by the pulse, so it must be <= 1
    proportion = abs(mig)
    if proportion > 1:
        raise ValueError(f"im(): abs(mig) must be in [0, 1], got {mig!r}")

    # Define population configurations
    population_configurations = [
        msprime.PopulationConfiguration(sample_size=sample_sizes[0], initial_size=N1),
        msprime.PopulationConfiguration(sample_size=sample_sizes[1], initial_size=N2)
    ]

    # Single directional pulse, placed halfway between the present and the split.
    # The sign of `mig` only selects the direction (stated backwards in time).
    mig_time = T_split / 2
    if mig >= 0:
        # lineages move from pop 1 into pop 0 (back in time)
        mig_event = msprime.MassMigration(time=mig_time, source=1, destination=0, proportion=proportion)
    else:
        # lineages move from pop 0 into pop 1 (back in time)
        mig_event = msprime.MassMigration(time=mig_time, source=0, destination=1, proportion=proportion)

    # Define demographic events (ordered by time)
    demographic_events = [
        mig_event,
        msprime.MassMigration(time=T_split, source=1, destination=0, proportion=1.0),  # move all in deme 1 to deme 0
        msprime.PopulationParametersChange(time=T_split, initial_size=N_anc, population_id=0)  # change to ancestral size
    ]

    # Simulate tree sequence
    ts = msprime.simulate(
        population_configurations=population_configurations,
        demographic_events=demographic_events,
        mutation_rate=params.get("mut"),
        length=params.get("length"),
        recombination_rate=reco,
        random_seed=seed
    )

    return ts


In [18]:
# Define some initial parameters
# Baseline parameter set handed to im()
params = dict(
    N1=100000,       # size of population 1
    N2=10000,        # size of population 2
    T_split=5000,    # time at which the two populations split
    N_anc=7148911,   # size of the ancestral population (7,148,911)
    mut=3.5e-9,      # mutation rate (kept fixed)
    length=1e4,      # sequence length (kept fixed)
    reco=8.4e-9,     # recombination rate (kept fixed)
    mig=0,           # migration setting; 0 means no migration
)

In [19]:
# Define additional parameters
seed = None                # random seed handed to the simulator (None = not fixed)
sample_sizes = [50] * 2    # samples taken from each of the two populations

## Task 1: define priors

In [9]:
# Task 1 results are written to <output_directory>/mosquito-task1.csv
output_directory = "."
output_file = os.path.join(output_directory, "mosquito-task1.csv")

In [10]:
# T_splits -- the two populations have been separated for at least 100 generations,
# and a major environmental change between the two locations happened around
# 8,000 generations ago.
T_splits = [5_000]

# N1s / N2s -- capture-recapture data show that population 1 is approximately 30x
# larger than population 2. We have no clear intuition about the absolute magnitudes,
# but previous findings suggest values between 50,000 and 200,000 for population 1.
N1s, N2s = [60_000], [2_000]

# migration_rates -- we expect either complete isolation after the split (rate = 0)
# or pervasive migration (rate = 0.1).
migration_rates = [0]


In [11]:
# Open the output file in write mode
# Draw 100 parameter sets from the priors, run one simulation per draw, and
# record the sampled parameter values (one CSV row per draw).
with open(output_file, "w", newline="") as csvfile:
    writer = csv.writer(csvfile, delimiter=",")

    # Write header
    writer.writerow(["Sim", "N1", "N2", "T_split", "mig"])

    # Perform sampling
    for i in range(100):

        # N1 ~ discrete uniform on [50000, 200000) (randint's `high` is exclusive)
        params["N1"] = np.random.randint(low=50000, high=200000, size=1)[0]
        # N2 is tied to N1 through the ~30x size ratio
        params["N2"] = int(params["N1"] / 30)
        # either no migration (0) or migration (0.1), equally likely
        params["mig"] = np.random.choice([0, 0.1], size=1)[0]

        # T_split ~ Normal(8000, 2000), truncated below at 100 generations.
        # An untruncated normal can occasionally return a value under the known
        # minimum separation time, or even a non-positive one, which would give
        # im() invalid event times. Redraw until the bound is respected.
        params["T_split"] = int(np.random.normal(loc=8000, scale=2000, size=1)[0])
        while params["T_split"] < 100:
            params["T_split"] = int(np.random.normal(loc=8000, scale=2000, size=1)[0])

        # Run the simulation for this draw (the tree sequence is not used further in this cell)
        ts = im(params, sample_sizes, seed, params["reco"])

        # Write data to file
        writer.writerow([i+1, params["N1"], params["N2"], params["T_split"], params["mig"]])


In [None]:
# inspect the distributions using R script

In [None]:
# Load the Task 1 table from the directory it was written to (`output_directory`)
# rather than from a hard-coded Google Drive path that only exists on Colab.
simulations = pd.read_csv(os.path.join(output_directory, "mosquito-task1.csv"))

## Task 2: simulate and perform ABC

In [20]:
# Task 2 results are written to <output_directory>/mosquito-task2.csv
output_directory = "."
output_file = os.path.join(output_directory, "mosquito-task2.csv")

In [21]:
# Open the output file in write mode
# Run 1000 simulations with fixed N1, N2 and mig and a random split time, and
# record the parameters plus summary statistics (one CSV row per simulation).
with open(output_file, "w", newline="") as csvfile:
    writer = csv.writer(csvfile, delimiter=",")

    # Write header
    writer.writerow(["N1", "N2", "T_split", "mig", "Fst", "dxy", "segsites1", "segsites2", "pi1", "pi2", "tajima1", "tajima2"])

    # These three are identical for every simulation, so set them once
    params["N1"] = 150_000
    params["N2"] = 5_000
    params["mig"] = 0

    # Perform simulations
    for i in range(1000):

        # T_split ~ Normal(8000, 2000), truncated below at 100 generations.
        # An untruncated normal can occasionally return a value under the known
        # minimum separation time, or even a non-positive one, which would give
        # im() invalid event times. Redraw until the bound is respected.
        params["T_split"] = int(np.random.normal(loc=8000, scale=2000, size=1)[0])
        while params["T_split"] < 100:
            params["T_split"] = int(np.random.normal(loc=8000, scale=2000, size=1)[0])

        ts = im(params, sample_sizes, seed, params["reco"])

        # Sample node ids of each population, built once and shared by all statistics
        pop_samples = [ts.samples(population=0), ts.samples(population=1)]

        # Between-population statistics
        dxy = ts.divergence(sample_sets=pop_samples)
        Fst = ts.Fst(sample_sets=pop_samples)

        # Within-population statistics: index 0 -> population 0, index 1 -> population 1
        ssites = ts.segregating_sites(sample_sets=pop_samples)
        div = ts.diversity(sample_sets=pop_samples)
        tajima = ts.Tajimas_D(sample_sets=pop_samples)

        # Write data to file
        writer.writerow([params["N1"], params["N2"], params["T_split"], params["mig"], Fst, dxy, ssites[0], ssites[1], div[0], div[1], tajima[0], tajima[1]])


In [None]:
# perform ABC using R script

In [None]:
# Load the Task 2 table from the directory it was written to (`output_directory`)
# rather than from a hard-coded Google Drive path that only exists on Colab.
simulations = pd.read_csv(os.path.join(output_directory, "mosquito-task2.csv"))